OpenTTD Source  20241120-master-g6d3adc6169
string.cpp
Go to the documentation of this file.
1 /*
2  * This file is part of OpenTTD.
3  * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
4  * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
5  * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
6  */
7 
10 #include "stdafx.h"
11 #include "debug.h"
12 #include "core/alloc_func.hpp"
13 #include "core/math_func.hpp"
14 #include "error_func.h"
15 #include "string_func.h"
16 #include "string_base.h"
17 
18 #include "table/control_codes.h"
19 
20 #include <sstream>
21 #include <iomanip>
22 
23 #ifdef _MSC_VER
24 # define strncasecmp strnicmp
25 #endif
26 
27 #ifdef _WIN32
28 # include "os/windows/win32.h"
29 #endif
30 
31 #ifdef WITH_UNISCRIBE
33 #endif
34 
35 #ifdef WITH_ICU_I18N
36 /* Required by StrNaturalCompare. */
37 # include <unicode/ustring.h>
38 # include "language.h"
39 # include "gfx_func.h"
40 #endif /* WITH_ICU_I18N */
41 
42 #if defined(WITH_COCOA)
43 # include "os/macosx/string_osx.h"
44 #endif
45 
46 #include "safeguards.h"
47 
48 
60 void strecpy(std::span<char> dst, std::string_view src)
61 {
62  /* Ensure source string fits with NUL terminator; dst must be at least 1 character longer than src. */
63  if (std::empty(dst) || std::size(src) >= std::size(dst) - 1U) {
64 #if defined(STRGEN) || defined(SETTINGSGEN)
65  FatalError("String too long for destination buffer");
66 #else /* STRGEN || SETTINGSGEN */
67  Debug(misc, 0, "String too long for destination buffer");
68  src = src.substr(0, std::size(dst) - 1U);
69 #endif /* STRGEN || SETTINGSGEN */
70  }
71 
72  auto it = std::copy(std::begin(src), std::end(src), std::begin(dst));
73  *it = '\0';
74 }
75 
81 std::string FormatArrayAsHex(std::span<const uint8_t> data)
82 {
83  std::string str;
84  str.reserve(data.size() * 2 + 1);
85 
86  for (auto b : data) {
87  fmt::format_to(std::back_inserter(str), "{:02X}", b);
88  }
89 
90  return str;
91 }
92 
93 
106 template <class T>
107 static void StrMakeValid(T &dst, const char *str, const char *last, StringValidationSettings settings)
108 {
109  /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
110 
111  while (str <= last && *str != '\0') {
112  size_t len = Utf8EncodedCharLen(*str);
113  char32_t c;
114  /* If the first byte does not look like the first byte of an encoded
115  * character, i.e. encoded length is 0, then this byte is definitely bad
116  * and it should be skipped.
117  * When the first byte looks like the first byte of an encoded character,
118  * then the remaining bytes in the string are checked whether the whole
119  * encoded character can be there. If that is not the case, this byte is
120  * skipped.
121  * Finally we attempt to decode the encoded character, which does certain
122  * extra validations to see whether the correct number of bytes were used
123  * to encode the character. If that is not the case, the byte is probably
124  * invalid and it is skipped. We could emit a question mark, but then the
125  * logic below cannot just copy bytes, it would need to re-encode the
126  * decoded characters as the length in bytes may have changed.
127  *
128  * The goals here is to get as much valid Utf8 encoded characters from the
129  * source string to the destination string.
130  *
131  * Note: a multi-byte encoded termination ('\0') will trigger the encoded
132  * char length and the decoded length to differ, so it will be ignored as
133  * invalid character data. If it were to reach the termination, then we
134  * would also reach the "last" byte of the string and a normal '\0'
135  * termination will be placed after it.
136  */
137  if (len == 0 || str + len > last + 1 || len != Utf8Decode(&c, str)) {
138  /* Maybe the next byte is still a valid character? */
139  str++;
140  continue;
141  }
142 
143  if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || ((settings & SVS_ALLOW_CONTROL_CODE) != 0 && c == SCC_ENCODED)) {
144  /* Copy the character back. Even if dst is current the same as str
145  * (i.e. no characters have been changed) this is quicker than
146  * moving the pointers ahead by len */
147  do {
148  *dst++ = *str++;
149  } while (--len != 0);
150  } else if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\n') {
151  *dst++ = *str++;
152  } else {
153  if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\r' && str[1] == '\n') {
154  str += len;
155  continue;
156  }
157  str += len;
158  if ((settings & SVS_REPLACE_TAB_CR_NL_WITH_SPACE) != 0 && (c == '\r' || c == '\n' || c == '\t')) {
159  /* Replace the tab, carriage return or newline with a space. */
160  *dst++ = ' ';
161  } else if ((settings & SVS_REPLACE_WITH_QUESTION_MARK) != 0) {
162  /* Replace the undesirable character with a question mark */
163  *dst++ = '?';
164  }
165  }
166  }
167 
168  /* String termination, if needed, is left to the caller of this function. */
169 }
170 
178 void StrMakeValidInPlace(char *str, const char *last, StringValidationSettings settings)
179 {
180  char *dst = str;
181  StrMakeValid(dst, str, last, settings);
182  *dst = '\0';
183 }
184 
193 {
194  /* We know it is '\0' terminated. */
195  StrMakeValidInPlace(str, str + strlen(str), settings);
196 }
197 
205 std::string StrMakeValid(std::string_view str, StringValidationSettings settings)
206 {
207  if (str.empty()) return {};
208 
209  auto buf = str.data();
210  auto last = buf + str.size() - 1;
211 
212  std::ostringstream dst;
213  std::ostreambuf_iterator<char> dst_iter(dst);
214  StrMakeValid(dst_iter, buf, last, settings);
215 
216  return dst.str();
217 }
218 
227 bool StrValid(std::span<const char> str)
228 {
229  /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
230  auto it = std::begin(str);
231  auto last = std::prev(std::end(str));
232 
233  while (it <= last && *it != '\0') {
234  size_t len = Utf8EncodedCharLen(*it);
235  /* Encoded length is 0 if the character isn't known.
236  * The length check is needed to prevent Utf8Decode to read
237  * over the terminating '\0' if that happens to be placed
238  * within the encoding of an UTF8 character. */
239  if (len == 0 || it + len > last) return false;
240 
241  char32_t c;
242  len = Utf8Decode(&c, &*it);
243  if (!IsPrintable(c) || (c >= SCC_SPRITE_START && c <= SCC_SPRITE_END)) {
244  return false;
245  }
246 
247  it += len;
248  }
249 
250  return *it == '\0';
251 }
252 
260 void StrTrimInPlace(std::string &str)
261 {
262  str = StrTrimView(str);
263 }
264 
/**
 * Get a view of the given string without its leading and trailing spaces.
 * Only the space character (' ') is trimmed.
 * @param str The string to trim.
 * @return The trimmed view; empty when \c str holds nothing but spaces.
 */
std::string_view StrTrimView(std::string_view str)
{
	const size_t begin = str.find_first_not_of(' ');
	if (begin == std::string_view::npos) return {};

	/* A non-space character exists, so this search cannot fail. */
	const size_t end = str.find_last_not_of(' ') + 1;
	return str.substr(begin, end - begin);
}
274 
281 bool StrStartsWithIgnoreCase(std::string_view str, const std::string_view prefix)
282 {
283  if (str.size() < prefix.size()) return false;
284  return StrEqualsIgnoreCase(str.substr(0, prefix.size()), prefix);
285 }
286 
/**
 * Case insensitive implementation of the standard character type traits.
 * Characters are compared by their upper-case form.
 */
struct CaseInsensitiveCharTraits : public std::char_traits<char> {
	/**
	 * Upper-case a single character for comparison.
	 * The character goes via unsigned char because toupper is only defined for
	 * values representable as unsigned char (or EOF); a plain char holding a
	 * byte >= 0x80 may be negative.
	 */
	static int Fold(char c) { return toupper(static_cast<unsigned char>(c)); }

	static bool eq(char c1, char c2) { return Fold(c1) == Fold(c2); }
	static bool ne(char c1, char c2) { return Fold(c1) != Fold(c2); }
	static bool lt(char c1, char c2) { return Fold(c1) <  Fold(c2); }

	/**
	 * Compare the first \c n characters of two character sequences, ignoring case.
	 * @return -1, 0 or 1 when \c s1 sorts before, equal to, or after \c s2.
	 */
	static int compare(const char *s1, const char *s2, size_t n)
	{
		for (; n != 0; --n, ++s1, ++s2) {
			const int u1 = Fold(*s1);
			const int u2 = Fold(*s2);
			if (u1 != u2) return u1 < u2 ? -1 : 1;
		}
		return 0;
	}

	/**
	 * Find the first character within the first \c n characters of \c s that equals \c a, ignoring case.
	 * @return Pointer to the match, or nullptr when there is none.
	 */
	static const char *find(const char *s, size_t n, char a)
	{
		const int wanted = Fold(a);
		for (; n > 0; --n, ++s) {
			if (Fold(*s) == wanted) return s;
		}
		return nullptr;
	}
};
311 
313 typedef std::basic_string_view<char, CaseInsensitiveCharTraits> CaseInsensitiveStringView;
314 
321 bool StrEndsWithIgnoreCase(std::string_view str, const std::string_view suffix)
322 {
323  if (str.size() < suffix.size()) return false;
324  return StrEqualsIgnoreCase(str.substr(str.size() - suffix.size()), suffix);
325 }
326 
334 int StrCompareIgnoreCase(const std::string_view str1, const std::string_view str2)
335 {
336  CaseInsensitiveStringView ci_str1{ str1.data(), str1.size() };
337  CaseInsensitiveStringView ci_str2{ str2.data(), str2.size() };
338  return ci_str1.compare(ci_str2);
339 }
340 
347 bool StrEqualsIgnoreCase(const std::string_view str1, const std::string_view str2)
348 {
349  if (str1.size() != str2.size()) return false;
350  return StrCompareIgnoreCase(str1, str2) == 0;
351 }
352 
359 size_t Utf8StringLength(const char *s)
360 {
361  size_t len = 0;
362  const char *t = s;
363  while (Utf8Consume(&t) != 0) len++;
364  return len;
365 }
366 
/**
 * Get the length of an UTF-8 encoded string in number of characters
 * and thus not the number of bytes that the encoded string contains.
 * Counting is done on the C-string form, so it stops at the first NUL character.
 * @param str The string to get the length for.
 * @return The length of the string in characters.
 */
size_t Utf8StringLength(const std::string &str)
{
	const char *raw = str.c_str();
	return Utf8StringLength(raw);
}
377 
/**
 * Convert the characters of a string to lower case, in place, starting at the given offset.
 * @param str The string to convert.
 * @param offs The index of the first character to convert; characters before it are left untouched.
 * @return True iff at least one character was changed.
 */
bool strtolower(std::string &str, std::string::size_type offs)
{
	/* Nothing to convert at or beyond the end; str.begin() + offs would be out of range for larger offsets. */
	if (offs >= str.size()) return false;

	bool changed = false;
	for (auto ch = str.begin() + offs; ch != str.end(); ++ch) {
		/* tolower is only defined for values representable as unsigned char. */
		auto new_ch = static_cast<char>(tolower(static_cast<unsigned char>(*ch)));
		changed |= new_ch != *ch;
		*ch = new_ch;
	}
	return changed;
}
388 
396 bool IsValidChar(char32_t key, CharSetFilter afilter)
397 {
398  switch (afilter) {
399  case CS_ALPHANUMERAL: return IsPrintable(key);
400  case CS_NUMERAL: return (key >= '0' && key <= '9');
401  case CS_NUMERAL_SPACE: return (key >= '0' && key <= '9') || key == ' ';
402  case CS_NUMERAL_SIGNED: return (key >= '0' && key <= '9') || key == '-';
403  case CS_ALPHA: return IsPrintable(key) && !(key >= '0' && key <= '9');
404  case CS_HEXADECIMAL: return (key >= '0' && key <= '9') || (key >= 'a' && key <= 'f') || (key >= 'A' && key <= 'F');
405  default: NOT_REACHED();
406  }
407 }
408 
409 
410 /* UTF-8 handling routines */
411 
412 
419 size_t Utf8Decode(char32_t *c, const char *s)
420 {
421  assert(c != nullptr);
422 
423  if (!HasBit(s[0], 7)) {
424  /* Single byte character: 0xxxxxxx */
425  *c = s[0];
426  return 1;
427  } else if (GB(s[0], 5, 3) == 6) {
428  if (IsUtf8Part(s[1])) {
429  /* Double byte character: 110xxxxx 10xxxxxx */
430  *c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
431  if (*c >= 0x80) return 2;
432  }
433  } else if (GB(s[0], 4, 4) == 14) {
434  if (IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
435  /* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
436  *c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
437  if (*c >= 0x800) return 3;
438  }
439  } else if (GB(s[0], 3, 5) == 30) {
440  if (IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
441  /* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
442  *c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
443  if (*c >= 0x10000 && *c <= 0x10FFFF) return 4;
444  }
445  }
446 
447  *c = '?';
448  return 1;
449 }
450 
451 
459 template <class T>
460 inline size_t Utf8Encode(T buf, char32_t c)
461 {
462  if (c < 0x80) {
463  *buf = c;
464  return 1;
465  } else if (c < 0x800) {
466  *buf++ = 0xC0 + GB(c, 6, 5);
467  *buf = 0x80 + GB(c, 0, 6);
468  return 2;
469  } else if (c < 0x10000) {
470  *buf++ = 0xE0 + GB(c, 12, 4);
471  *buf++ = 0x80 + GB(c, 6, 6);
472  *buf = 0x80 + GB(c, 0, 6);
473  return 3;
474  } else if (c < 0x110000) {
475  *buf++ = 0xF0 + GB(c, 18, 3);
476  *buf++ = 0x80 + GB(c, 12, 6);
477  *buf++ = 0x80 + GB(c, 6, 6);
478  *buf = 0x80 + GB(c, 0, 6);
479  return 4;
480  }
481 
482  *buf = '?';
483  return 1;
484 }
485 
486 size_t Utf8Encode(char *buf, char32_t c)
487 {
488  return Utf8Encode<char *>(buf, c);
489 }
490 
491 size_t Utf8Encode(std::ostreambuf_iterator<char> &buf, char32_t c)
492 {
493  return Utf8Encode<std::ostreambuf_iterator<char> &>(buf, c);
494 }
495 
496 size_t Utf8Encode(std::back_insert_iterator<std::string> &buf, char32_t c)
497 {
498  return Utf8Encode<std::back_insert_iterator<std::string> &>(buf, c);
499 }
500 
508 size_t Utf8TrimString(char *s, size_t maxlen)
509 {
510  size_t length = 0;
511 
512  for (const char *ptr = strchr(s, '\0'); *s != '\0';) {
513  size_t len = Utf8EncodedCharLen(*s);
514  /* Silently ignore invalid UTF8 sequences, our only concern trimming */
515  if (len == 0) len = 1;
516 
517  /* Take care when a hard cutoff was made for the string and
518  * the last UTF8 sequence is invalid */
519  if (length + len >= maxlen || (s + len > ptr)) break;
520  s += len;
521  length += len;
522  }
523 
524  *s = '\0';
525  return length;
526 }
527 
528 #ifdef DEFINE_STRCASESTR
529 char *strcasestr(const char *haystack, const char *needle)
530 {
531  size_t hay_len = strlen(haystack);
532  size_t needle_len = strlen(needle);
533  while (hay_len >= needle_len) {
534  if (strncasecmp(haystack, needle, needle_len) == 0) return const_cast<char *>(haystack);
535 
536  haystack++;
537  hay_len--;
538  }
539 
540  return nullptr;
541 }
542 #endif /* DEFINE_STRCASESTR */
543 
549 static bool IsGarbageCharacter(char32_t c)
550 {
551  if (c >= '0' && c <= '9') return false;
552  if (c >= 'A' && c <= 'Z') return false;
553  if (c >= 'a' && c <= 'z') return false;
554  if (c >= SCC_CONTROL_START && c <= SCC_CONTROL_END) return true;
555  if (c >= 0xC0 && c <= 0x10FFFF) return false;
556 
557  return true;
558 }
559 
568 static std::string_view SkipGarbage(std::string_view str)
569 {
570  auto first = std::begin(str);
571  auto last = std::end(str);
572  while (first < last) {
573  char32_t c;
574  size_t len = Utf8Decode(&c, &*first);
575  if (!IsGarbageCharacter(c)) break;
576  first += len;
577  }
578  return {first, last};
579 }
580 
589 int StrNaturalCompare(std::string_view s1, std::string_view s2, bool ignore_garbage_at_front)
590 {
591  if (ignore_garbage_at_front) {
592  s1 = SkipGarbage(s1);
593  s2 = SkipGarbage(s2);
594  }
595 
596 #ifdef WITH_ICU_I18N
597  if (_current_collator) {
598  UErrorCode status = U_ZERO_ERROR;
599  int result = _current_collator->compareUTF8(icu::StringPiece(s1.data(), s1.size()), icu::StringPiece(s2.data(), s2.size()), status);
600  if (U_SUCCESS(status)) return result;
601  }
602 #endif /* WITH_ICU_I18N */
603 
604 #if defined(_WIN32) && !defined(STRGEN) && !defined(SETTINGSGEN)
605  int res = OTTDStringCompare(s1, s2);
606  if (res != 0) return res - 2; // Convert to normal C return values.
607 #endif
608 
609 #if defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN)
610  int res = MacOSStringCompare(s1, s2);
611  if (res != 0) return res - 2; // Convert to normal C return values.
612 #endif
613 
614  /* Do a normal comparison if ICU is missing or if we cannot create a collator. */
615  return StrCompareIgnoreCase(s1, s2);
616 }
617 
618 #ifdef WITH_ICU_I18N
619 
620 #include <unicode/stsearch.h>
621 
630 static int ICUStringContains(const std::string_view str, const std::string_view value, bool case_insensitive)
631 {
632  if (_current_collator) {
633  std::unique_ptr<icu::RuleBasedCollator> coll(dynamic_cast<icu::RuleBasedCollator *>(_current_collator->clone()));
634  if (coll) {
635  UErrorCode status = U_ZERO_ERROR;
636  coll->setStrength(case_insensitive ? icu::Collator::SECONDARY : icu::Collator::TERTIARY);
637  coll->setAttribute(UCOL_NUMERIC_COLLATION, UCOL_OFF, status);
638 
639  auto u_str = icu::UnicodeString::fromUTF8(icu::StringPiece(str.data(), str.size()));
640  auto u_value = icu::UnicodeString::fromUTF8(icu::StringPiece(value.data(), value.size()));
641  icu::StringSearch u_searcher(u_value, u_str, coll.get(), nullptr, status);
642  if (U_SUCCESS(status)) {
643  auto pos = u_searcher.first(status);
644  if (U_SUCCESS(status)) return pos != USEARCH_DONE ? 1 : 0;
645  }
646  }
647  }
648 
649  return -1;
650 }
651 #endif /* WITH_ICU_I18N */
652 
/**
 * Checks if a string is contained in another string, case sensitively.
 * The ICU or OS specific search is used when it can perform the search;
 * otherwise a plain byte-wise search is done.
 * @param str The string to search in.
 * @param value The string to search for.
 * @return True if a match was found.
 */
[[nodiscard]] bool StrNaturalContains(const std::string_view str, const std::string_view value)
{
#ifdef WITH_ICU_I18N
	/* A negative result means the search could not be performed. */
	int icu_result = ICUStringContains(str, value, false);
	if (icu_result >= 0) return icu_result > 0;
#endif /* WITH_ICU_I18N */

#if defined(_WIN32) && !defined(STRGEN) && !defined(SETTINGSGEN)
	int os_result = Win32StringContains(str, value, false);
	if (os_result >= 0) return os_result > 0;
#endif

#if defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN)
	int os_result = MacOSStringContains(str, value, false);
	if (os_result >= 0) return os_result > 0;
#endif

	const auto found_at = str.find(value);
	return found_at != std::string_view::npos;
}
679 
687 [[nodiscard]] bool StrNaturalContainsIgnoreCase(const std::string_view str, const std::string_view value)
688 {
689 #ifdef WITH_ICU_I18N
690  int res_u = ICUStringContains(str, value, true);
691  if (res_u >= 0) return res_u > 0;
692 #endif /* WITH_ICU_I18N */
693 
694 #if defined(_WIN32) && !defined(STRGEN) && !defined(SETTINGSGEN)
695  int res = Win32StringContains(str, value, true);
696  if (res >= 0) return res > 0;
697 #endif
698 
699 #if defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN)
700  int res = MacOSStringContains(str, value, true);
701  if (res >= 0) return res > 0;
702 #endif
703 
704  CaseInsensitiveStringView ci_str{ str.data(), str.size() };
705  CaseInsensitiveStringView ci_value{ value.data(), value.size() };
706  return ci_str.find(ci_value) != CaseInsensitiveStringView::npos;
707 }
708 
/**
 * Convert a single hex-nibble to a byte.
 * @param c The hex-nibble to convert; upper and lower case digits are accepted.
 * @return The value 0..15 of the nibble, or -1 if \c c is not a hexadecimal digit.
 */
static int ConvertHexNibbleToByte(char c)
{
	if ('0' <= c && c <= '9') return c - '0';
	if ('A' <= c && c <= 'F') return (c - 'A') + 10;
	if ('a' <= c && c <= 'f') return (c - 'a') + 10;
	return -1;
}
722 
734 bool ConvertHexToBytes(std::string_view hex, std::span<uint8_t> bytes)
735 {
736  if (bytes.size() != hex.size() / 2) {
737  return false;
738  }
739 
740  /* Hex-string lengths are always divisible by 2. */
741  if (hex.size() % 2 != 0) {
742  return false;
743  }
744 
745  for (size_t i = 0; i < hex.size() / 2; i++) {
746  auto hi = ConvertHexNibbleToByte(hex[i * 2]);
747  auto lo = ConvertHexNibbleToByte(hex[i * 2 + 1]);
748 
749  if (hi < 0 || lo < 0) {
750  return false;
751  }
752 
753  bytes[i] = (hi << 4) | lo;
754  }
755 
756  return true;
757 }
758 
759 #ifdef WITH_UNISCRIBE
760 
761 /* static */ std::unique_ptr<StringIterator> StringIterator::Create()
762 {
763  return std::make_unique<UniscribeStringIterator>();
764 }
765 
766 #elif defined(WITH_ICU_I18N)
767 
768 #include <unicode/utext.h>
769 #include <unicode/brkiter.h>
770 
773 {
774  icu::BreakIterator *char_itr;
775  icu::BreakIterator *word_itr;
776 
777  std::vector<UChar> utf16_str;
778  std::vector<size_t> utf16_to_utf8;
779 
780 public:
781  IcuStringIterator() : char_itr(nullptr), word_itr(nullptr)
782  {
783  UErrorCode status = U_ZERO_ERROR;
784  this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != nullptr ? _current_language->isocode : "en"), status);
785  this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != nullptr ? _current_language->isocode : "en"), status);
786 
787  this->utf16_str.push_back('\0');
788  this->utf16_to_utf8.push_back(0);
789  }
790 
791  ~IcuStringIterator() override
792  {
793  delete this->char_itr;
794  delete this->word_itr;
795  }
796 
797  void SetString(const char *s) override
798  {
799  const char *string_base = s;
800 
801  /* Unfortunately current ICU versions only provide rudimentary support
802  * for word break iterators (especially for CJK languages) in combination
803  * with UTF-8 input. As a work around we have to convert the input to
804  * UTF-16 and create a mapping back to UTF-8 character indices. */
805  this->utf16_str.clear();
806  this->utf16_to_utf8.clear();
807 
808  while (*s != '\0') {
809  size_t idx = s - string_base;
810 
811  char32_t c = Utf8Consume(&s);
812  if (c < 0x10000) {
813  this->utf16_str.push_back((UChar)c);
814  } else {
815  /* Make a surrogate pair. */
816  this->utf16_str.push_back((UChar)(0xD800 + ((c - 0x10000) >> 10)));
817  this->utf16_str.push_back((UChar)(0xDC00 + ((c - 0x10000) & 0x3FF)));
818  this->utf16_to_utf8.push_back(idx);
819  }
820  this->utf16_to_utf8.push_back(idx);
821  }
822  this->utf16_str.push_back('\0');
823  this->utf16_to_utf8.push_back(s - string_base);
824 
825  UText text = UTEXT_INITIALIZER;
826  UErrorCode status = U_ZERO_ERROR;
827  utext_openUChars(&text, this->utf16_str.data(), this->utf16_str.size() - 1, &status);
828  this->char_itr->setText(&text, status);
829  this->word_itr->setText(&text, status);
830  this->char_itr->first();
831  this->word_itr->first();
832  }
833 
834  size_t SetCurPosition(size_t pos) override
835  {
836  /* Convert incoming position to an UTF-16 string index. */
837  uint utf16_pos = 0;
838  for (uint i = 0; i < this->utf16_to_utf8.size(); i++) {
839  if (this->utf16_to_utf8[i] == pos) {
840  utf16_pos = i;
841  break;
842  }
843  }
844 
845  /* isBoundary has the documented side-effect of setting the current
846  * position to the first valid boundary equal to or greater than
847  * the passed value. */
848  this->char_itr->isBoundary(utf16_pos);
849  return this->utf16_to_utf8[this->char_itr->current()];
850  }
851 
852  size_t Next(IterType what) override
853  {
854  int32_t pos;
855  switch (what) {
856  case ITER_CHARACTER:
857  pos = this->char_itr->next();
858  break;
859 
860  case ITER_WORD:
861  pos = this->word_itr->following(this->char_itr->current());
862  /* The ICU word iterator considers both the start and the end of a word a valid
863  * break point, but we only want word starts. Move to the next location in
864  * case the new position points to whitespace. */
865  while (pos != icu::BreakIterator::DONE &&
866  IsWhitespace(Utf16DecodeChar((const uint16_t *)&this->utf16_str[pos]))) {
867  int32_t new_pos = this->word_itr->next();
868  /* Don't set it to DONE if it was valid before. Otherwise we'll return END
869  * even though the iterator wasn't at the end of the string before. */
870  if (new_pos == icu::BreakIterator::DONE) break;
871  pos = new_pos;
872  }
873 
874  this->char_itr->isBoundary(pos);
875  break;
876 
877  default:
878  NOT_REACHED();
879  }
880 
881  return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
882  }
883 
884  size_t Prev(IterType what) override
885  {
886  int32_t pos;
887  switch (what) {
888  case ITER_CHARACTER:
889  pos = this->char_itr->previous();
890  break;
891 
892  case ITER_WORD:
893  pos = this->word_itr->preceding(this->char_itr->current());
894  /* The ICU word iterator considers both the start and the end of a word a valid
895  * break point, but we only want word starts. Move to the previous location in
896  * case the new position points to whitespace. */
897  while (pos != icu::BreakIterator::DONE &&
898  IsWhitespace(Utf16DecodeChar((const uint16_t *)&this->utf16_str[pos]))) {
899  int32_t new_pos = this->word_itr->previous();
900  /* Don't set it to DONE if it was valid before. Otherwise we'll return END
901  * even though the iterator wasn't at the start of the string before. */
902  if (new_pos == icu::BreakIterator::DONE) break;
903  pos = new_pos;
904  }
905 
906  this->char_itr->isBoundary(pos);
907  break;
908 
909  default:
910  NOT_REACHED();
911  }
912 
913  return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
914  }
915 };
916 
917 /* static */ std::unique_ptr<StringIterator> StringIterator::Create()
918 {
919  return std::make_unique<IcuStringIterator>();
920 }
921 
922 #else
923 
925 class DefaultStringIterator : public StringIterator
926 {
927  const char *string;
928  size_t len;
929  size_t cur_pos;
930 
931 public:
932  DefaultStringIterator() : string(nullptr), len(0), cur_pos(0)
933  {
934  }
935 
936  void SetString(const char *s) override
937  {
938  this->string = s;
939  this->len = strlen(s);
940  this->cur_pos = 0;
941  }
942 
943  size_t SetCurPosition(size_t pos) override
944  {
945  assert(this->string != nullptr && pos <= this->len);
946  /* Sanitize in case we get a position inside an UTF-8 sequence. */
947  while (pos > 0 && IsUtf8Part(this->string[pos])) pos--;
948  return this->cur_pos = pos;
949  }
950 
951  size_t Next(IterType what) override
952  {
953  assert(this->string != nullptr);
954 
955  /* Already at the end? */
956  if (this->cur_pos >= this->len) return END;
957 
958  switch (what) {
959  case ITER_CHARACTER: {
960  char32_t c;
961  this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
962  return this->cur_pos;
963  }
964 
965  case ITER_WORD: {
966  char32_t c;
967  /* Consume current word. */
968  size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
969  while (this->cur_pos < this->len && !IsWhitespace(c)) {
970  this->cur_pos += offs;
971  offs = Utf8Decode(&c, this->string + this->cur_pos);
972  }
973  /* Consume whitespace to the next word. */
974  while (this->cur_pos < this->len && IsWhitespace(c)) {
975  this->cur_pos += offs;
976  offs = Utf8Decode(&c, this->string + this->cur_pos);
977  }
978 
979  return this->cur_pos;
980  }
981 
982  default:
983  NOT_REACHED();
984  }
985 
986  return END;
987  }
988 
989  size_t Prev(IterType what) override
990  {
991  assert(this->string != nullptr);
992 
993  /* Already at the beginning? */
994  if (this->cur_pos == 0) return END;
995 
996  switch (what) {
997  case ITER_CHARACTER:
998  return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
999 
1000  case ITER_WORD: {
1001  const char *s = this->string + this->cur_pos;
1002  char32_t c;
1003  /* Consume preceding whitespace. */
1004  do {
1005  s = Utf8PrevChar(s);
1006  Utf8Decode(&c, s);
1007  } while (s > this->string && IsWhitespace(c));
1008  /* Consume preceding word. */
1009  while (s > this->string && !IsWhitespace(c)) {
1010  s = Utf8PrevChar(s);
1011  Utf8Decode(&c, s);
1012  }
1013  /* Move caret back to the beginning of the word. */
1014  if (IsWhitespace(c)) Utf8Consume(&s);
1015 
1016  return this->cur_pos = s - this->string;
1017  }
1018 
1019  default:
1020  NOT_REACHED();
1021  }
1022 
1023  return END;
1024  }
1025 };
1026 
1027 #if defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN)
1028 /* static */ std::unique_ptr<StringIterator> StringIterator::Create()
1029 {
1030  std::unique_ptr<StringIterator> i = OSXStringIterator::Create();
1031  if (i != nullptr) return i;
1032 
1033  return std::make_unique<DefaultStringIterator>();
1034 }
1035 #else
1036 /* static */ std::unique_ptr<StringIterator> StringIterator::Create()
1037 {
1038  return std::make_unique<DefaultStringIterator>();
1039 }
1040 #endif /* defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN) */
1041 
1042 #endif
Functions related to the allocation of memory.
constexpr debug_inline bool HasBit(const T x, const uint8_t y)
Checks if a bit in a value is set.
constexpr static debug_inline uint GB(const T x, const uint8_t s, const uint8_t n)
Fetch n bits from x, started at bit s.
String iterator using ICU as a backend.
Definition: string.cpp:773
void SetString(const char *s) override
Set a new iteration string.
Definition: string.cpp:797
size_t Prev(IterType what) override
Move the cursor back by one iteration unit.
Definition: string.cpp:884
size_t Next(IterType what) override
Advance the cursor by one iteration unit.
Definition: string.cpp:852
std::vector< size_t > utf16_to_utf8
Mapping from UTF-16 code point position to index in the UTF-8 source string.
Definition: string.cpp:778
size_t SetCurPosition(size_t pos) override
Change the current string cursor.
Definition: string.cpp:834
std::vector< UChar > utf16_str
UTF-16 copy of the string.
Definition: string.cpp:777
icu::BreakIterator * char_itr
ICU iterator for characters.
Definition: string.cpp:774
icu::BreakIterator * word_itr
ICU iterator for words.
Definition: string.cpp:775
Class for iterating over different kind of parts of a string.
Definition: string_base.h:14
static const size_t END
Sentinel to indicate end-of-iteration.
Definition: string_base.h:23
virtual size_t Prev(IterType what=ITER_CHARACTER)=0
Move the cursor back by one iteration unit.
virtual size_t SetCurPosition(size_t pos)=0
Change the current string cursor.
virtual size_t Next(IterType what=ITER_CHARACTER)=0
Advance the cursor by one iteration unit.
IterType
Type of the iterator.
Definition: string_base.h:17
@ ITER_WORD
Iterate over words.
Definition: string_base.h:19
@ ITER_CHARACTER
Iterate over characters (or more exactly grapheme clusters).
Definition: string_base.h:18
static std::unique_ptr< StringIterator > Create()
Create a new iterator instance.
Definition: string.cpp:917
virtual void SetString(const char *s)=0
Set a new iteration string.
Control codes that are embedded in the translation strings.
Functions related to debugging.
#define Debug(category, level, format_string,...)
Output a line of debugging information.
Definition: debug.h:37
Error reporting related functions.
fluid_settings_t * settings
FluidSynth settings handle.
Definition: fluidsynth.cpp:21
Functions related to the gfx engine.
Information about languages and their files.
const LanguageMetadata * _current_language
The currently loaded language.
Definition: strings.cpp:54
std::unique_ptr< icu::Collator > _current_collator
Collator for the language currently in use.
Definition: strings.cpp:59
Integer math functions.
A number of safeguards to prevent using unsafe methods.
Definition of base types and functions in a cross-platform compatible way.
static void StrMakeValid(T &dst, const char *str, const char *last, StringValidationSettings settings)
Copies the valid (UTF-8) characters from str up to last to the dst.
Definition: string.cpp:107
bool ConvertHexToBytes(std::string_view hex, std::span< uint8_t > bytes)
Convert a hex-string to a byte-array, while validating it was actually hex.
Definition: string.cpp:734
bool IsValidChar(char32_t key, CharSetFilter afilter)
Only allow certain keys.
Definition: string.cpp:396
bool StrEqualsIgnoreCase(const std::string_view str1, const std::string_view str2)
Compares two string( view)s for equality, while ignoring the case of the characters.
Definition: string.cpp:347
bool StrNaturalContains(const std::string_view str, const std::string_view value)
Checks if a string is contained in another string with a locale-aware comparison that is case sensiti...
Definition: string.cpp:660
void StrMakeValidInPlace(char *str, const char *last, StringValidationSettings settings)
Scans the string for invalid characters and replaces them with a question mark '?' (if not ignored).
Definition: string.cpp:178
void strecpy(std::span< char > dst, std::string_view src)
Copies characters from one buffer to another.
Definition: string.cpp:60
std::string FormatArrayAsHex(std::span< const uint8_t > data)
Format a byte array into a continuous hex string.
Definition: string.cpp:81
size_t Utf8StringLength(const char *s)
Get the length of an UTF-8 encoded string in number of characters and thus not the number of bytes th...
Definition: string.cpp:359
bool StrStartsWithIgnoreCase(std::string_view str, const std::string_view prefix)
Check whether the given string starts with the given prefix, ignoring case.
Definition: string.cpp:281
bool StrValid(std::span< const char > str)
Checks whether the given string is valid, i.e.
Definition: string.cpp:227
static int ConvertHexNibbleToByte(char c)
Convert a single hex-nibble to a byte.
Definition: string.cpp:715
static int ICUStringContains(const std::string_view str, const std::string_view value, bool case_insensitive)
Search if a string is contained in another string using the current locale.
Definition: string.cpp:630
static std::string_view SkipGarbage(std::string_view str)
Skip some of the 'garbage' in the string that we don't want to use to sort on.
Definition: string.cpp:568
size_t Utf8Decode(char32_t *c, const char *s)
Decode and consume the next UTF-8 encoded character.
Definition: string.cpp:419
int StrNaturalCompare(std::string_view s1, std::string_view s2, bool ignore_garbage_at_front)
Compares two strings using case insensitive natural sort.
Definition: string.cpp:589
bool StrNaturalContainsIgnoreCase(const std::string_view str, const std::string_view value)
Checks if a string is contained in another string with a locale-aware comparison that is case insensitive.
Definition: string.cpp:687
std::basic_string_view< char, CaseInsensitiveCharTraits > CaseInsensitiveStringView
Case insensitive string view.
Definition: string.cpp:313
int StrCompareIgnoreCase(const std::string_view str1, const std::string_view str2)
Compares two string( view)s, while ignoring the case of the characters.
Definition: string.cpp:334
bool StrEndsWithIgnoreCase(std::string_view str, const std::string_view suffix)
Check whether the given string ends with the given suffix, ignoring case.
Definition: string.cpp:321
size_t Utf8TrimString(char *s, size_t maxlen)
Properly terminate a UTF-8 string to some maximum length.
Definition: string.cpp:508
void StrTrimInPlace(std::string &str)
Trim the spaces from the given string in place.
Definition: string.cpp:260
size_t Utf8Encode(T buf, char32_t c)
Encode a unicode character and place it in the buffer.
Definition: string.cpp:460
static bool IsGarbageCharacter(char32_t c)
Test if a unicode character is considered garbage to be skipped.
Definition: string.cpp:549
Functions related to low-level strings.
char32_t Utf16DecodeChar(const uint16_t *c)
Decode a UTF-16 character.
Definition: string_func.h:202
bool IsWhitespace(char32_t c)
Check whether a UNICODE character is whitespace or not.
Definition: string_func.h:249
int8_t Utf8EncodedCharLen(char c)
Return the length of a UTF-8 encoded value based on a single char.
Definition: string_func.h:124
char * Utf8PrevChar(char *s)
Retrieve the previous UNICODE character in an UTF-8 encoded string.
Definition: string_func.h:149
int MacOSStringCompare(std::string_view s1, std::string_view s2)
Compares two strings using case insensitive natural sort.
Definition: string_osx.cpp:329
int MacOSStringContains(const std::string_view str, const std::string_view value, bool case_insensitive)
Search if a string is contained in another string using the current locale.
Definition: string_osx.cpp:353
Functions related to localized text support on OSX.
CharSetFilter
Valid filter types for IsValidChar.
Definition: string_type.h:24
@ CS_NUMERAL_SPACE
Only numbers and spaces.
Definition: string_type.h:27
@ CS_HEXADECIMAL
Only hexadecimal characters.
Definition: string_type.h:30
@ CS_NUMERAL
Only numeric ones.
Definition: string_type.h:26
@ CS_NUMERAL_SIGNED
Only numbers and '-' for negative values.
Definition: string_type.h:28
@ CS_ALPHA
Only alphabetic values.
Definition: string_type.h:29
@ CS_ALPHANUMERAL
Both numeric and alphabetic and spaces and stuff.
Definition: string_type.h:25
StringValidationSettings
Settings for the string validation.
Definition: string_type.h:44
@ SVS_ALLOW_CONTROL_CODE
Allow the special control codes.
Definition: string_type.h:48
@ SVS_REPLACE_TAB_CR_NL_WITH_SPACE
Replace tabs ('\t'), carriage returns ('\r') and newlines ('\n') with spaces.
Definition: string_type.h:54
@ SVS_ALLOW_NEWLINE
Allow newlines; replaces '\r\n' with '\n' during processing.
Definition: string_type.h:47
@ SVS_REPLACE_WITH_QUESTION_MARK
Replace the unknown/bad bits with question marks.
Definition: string_type.h:46
Functions related to laying out text on Win32.
Case insensitive implementation of the standard character type traits.
Definition: string.cpp:288
char isocode[16]
the ISO code for the language (not country code)
Definition: language.h:31
int Win32StringContains(const std::string_view str, const std::string_view value, bool case_insensitive)
Search if a string is contained in another string using the current locale.
Definition: win32.cpp:479
declarations of functions for MS windows systems