OpenTTD Source  20240917-master-g9ab0a47812
string.cpp
Go to the documentation of this file.
1 /*
2  * This file is part of OpenTTD.
3  * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
4  * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
5  * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
6  */
7 
10 #include "stdafx.h"
11 #include "debug.h"
12 #include "core/alloc_func.hpp"
13 #include "core/math_func.hpp"
14 #include "error_func.h"
15 #include "string_func.h"
16 #include "string_base.h"
17 
18 #include "table/control_codes.h"
19 
20 #include <sstream>
21 #include <iomanip>
22 
23 #ifdef _MSC_VER
24 # define strncasecmp strnicmp
25 #endif
26 
27 #ifdef _WIN32
28 # include "os/windows/win32.h"
29 #endif
30 
31 #ifdef WITH_UNISCRIBE
33 #endif
34 
35 #ifdef WITH_ICU_I18N
36 /* Required by StrNaturalCompare. */
37 # include <unicode/ustring.h>
38 # include "language.h"
39 # include "gfx_func.h"
40 #endif /* WITH_ICU_I18N */
41 
42 #if defined(WITH_COCOA)
43 # include "os/macosx/string_osx.h"
44 #endif
45 
46 #include "safeguards.h"
47 
48 
60 void strecpy(std::span<char> dst, std::string_view src)
61 {
62  /* Ensure source string fits with NUL terminator; dst must be at least 1 character longer than src. */
63  if (std::empty(dst) || std::size(src) >= std::size(dst) - 1U) {
64 #if defined(STRGEN) || defined(SETTINGSGEN)
65  FatalError("String too long for destination buffer");
66 #else /* STRGEN || SETTINGSGEN */
67  Debug(misc, 0, "String too long for destination buffer");
68  src = src.substr(0, std::size(dst) - 1U);
69 #endif /* STRGEN || SETTINGSGEN */
70  }
71 
72  auto it = std::copy(std::begin(src), std::end(src), std::begin(dst));
73  *it = '\0';
74 }
75 
81 std::string FormatArrayAsHex(std::span<const uint8_t> data)
82 {
83  std::string str;
84  str.reserve(data.size() * 2 + 1);
85 
86  for (auto b : data) {
87  fmt::format_to(std::back_inserter(str), "{:02X}", b);
88  }
89 
90  return str;
91 }
92 
93 
106 template <class T>
107 static void StrMakeValid(T &dst, const char *str, const char *last, StringValidationSettings settings)
108 {
109  /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
110 
111  while (str <= last && *str != '\0') {
112  size_t len = Utf8EncodedCharLen(*str);
113  char32_t c;
114  /* If the first byte does not look like the first byte of an encoded
115  * character, i.e. encoded length is 0, then this byte is definitely bad
116  * and it should be skipped.
117  * When the first byte looks like the first byte of an encoded character,
118  * then the remaining bytes in the string are checked whether the whole
119  * encoded character can be there. If that is not the case, this byte is
120  * skipped.
121  * Finally we attempt to decode the encoded character, which does certain
122  * extra validations to see whether the correct number of bytes were used
123  * to encode the character. If that is not the case, the byte is probably
124  * invalid and it is skipped. We could emit a question mark, but then the
125  * logic below cannot just copy bytes, it would need to re-encode the
126  * decoded characters as the length in bytes may have changed.
127  *
128  * The goals here is to get as much valid Utf8 encoded characters from the
129  * source string to the destination string.
130  *
131  * Note: a multi-byte encoded termination ('\0') will trigger the encoded
132  * char length and the decoded length to differ, so it will be ignored as
133  * invalid character data. If it were to reach the termination, then we
134  * would also reach the "last" byte of the string and a normal '\0'
135  * termination will be placed after it.
136  */
137  if (len == 0 || str + len > last + 1 || len != Utf8Decode(&c, str)) {
138  /* Maybe the next byte is still a valid character? */
139  str++;
140  continue;
141  }
142 
143  if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || ((settings & SVS_ALLOW_CONTROL_CODE) != 0 && c == SCC_ENCODED)) {
144  /* Copy the character back. Even if dst is current the same as str
145  * (i.e. no characters have been changed) this is quicker than
146  * moving the pointers ahead by len */
147  do {
148  *dst++ = *str++;
149  } while (--len != 0);
150  } else if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\n') {
151  *dst++ = *str++;
152  } else {
153  if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\r' && str[1] == '\n') {
154  str += len;
155  continue;
156  }
157  str += len;
158  if ((settings & SVS_REPLACE_TAB_CR_NL_WITH_SPACE) != 0 && (c == '\r' || c == '\n' || c == '\t')) {
159  /* Replace the tab, carriage return or newline with a space. */
160  *dst++ = ' ';
161  } else if ((settings & SVS_REPLACE_WITH_QUESTION_MARK) != 0) {
162  /* Replace the undesirable character with a question mark */
163  *dst++ = '?';
164  }
165  }
166  }
167 
168  /* String termination, if needed, is left to the caller of this function. */
169 }
170 
178 void StrMakeValidInPlace(char *str, const char *last, StringValidationSettings settings)
179 {
180  char *dst = str;
181  StrMakeValid(dst, str, last, settings);
182  *dst = '\0';
183 }
184 
193 {
194  /* We know it is '\0' terminated. */
195  StrMakeValidInPlace(str, str + strlen(str), settings);
196 }
197 
205 std::string StrMakeValid(std::string_view str, StringValidationSettings settings)
206 {
207  if (str.empty()) return {};
208 
209  auto buf = str.data();
210  auto last = buf + str.size() - 1;
211 
212  std::ostringstream dst;
213  std::ostreambuf_iterator<char> dst_iter(dst);
214  StrMakeValid(dst_iter, buf, last, settings);
215 
216  return dst.str();
217 }
218 
227 bool StrValid(std::span<const char> str)
228 {
229  /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
230  auto it = std::begin(str);
231  auto last = std::prev(std::end(str));
232 
233  while (it <= last && *it != '\0') {
234  size_t len = Utf8EncodedCharLen(*it);
235  /* Encoded length is 0 if the character isn't known.
236  * The length check is needed to prevent Utf8Decode to read
237  * over the terminating '\0' if that happens to be placed
238  * within the encoding of an UTF8 character. */
239  if (len == 0 || it + len > last) return false;
240 
241  char32_t c;
242  len = Utf8Decode(&c, &*it);
243  if (!IsPrintable(c) || (c >= SCC_SPRITE_START && c <= SCC_SPRITE_END)) {
244  return false;
245  }
246 
247  it += len;
248  }
249 
250  return *it == '\0';
251 }
252 
260 void StrTrimInPlace(std::string &str)
261 {
262  str = StrTrimView(str);
263 }
264 
/**
 * Get a view on the given string without the spaces at its begin and end.
 * @param str The string to trim.
 * @return The trimmed view; a default constructed view when nothing but spaces is present.
 */
std::string_view StrTrimView(std::string_view str)
{
	const size_t begin = str.find_first_not_of(' ');
	if (begin == std::string::npos) return std::string_view{};

	/* A non-space character exists, so the search from the back succeeds as well. */
	const size_t end = str.find_last_not_of(' ') + 1;
	return str.substr(begin, end - begin);
}
274 
281 bool StrStartsWithIgnoreCase(std::string_view str, const std::string_view prefix)
282 {
283  if (str.size() < prefix.size()) return false;
284  return StrEqualsIgnoreCase(str.substr(0, prefix.size()), prefix);
285 }
286 
/**
 * Case insensitive implementation of the standard character type traits.
 *
 * Characters are compared after upper-casing them. Each character is first
 * converted to unsigned char: passing a negative value (any byte of a
 * multi-byte UTF-8 sequence where char is signed) to toupper() is undefined,
 * and comparing the unsigned values gives the same order on all platforms.
 */
struct CaseInsensitiveCharTraits : public std::char_traits<char> {
	/** Upper-case a single character for comparison purposes. */
	static int Fold(char c) { return toupper(static_cast<unsigned char>(c)); }

	static bool eq(char c1, char c2) { return Fold(c1) == Fold(c2); }
	static bool ne(char c1, char c2) { return Fold(c1) != Fold(c2); }
	static bool lt(char c1, char c2) { return Fold(c1) < Fold(c2); }

	/**
	 * Compare the first \a n characters of two character sequences.
	 * @return -1, 0 or 1 when \a s1 sorts before, equal to or after \a s2.
	 */
	static int compare(const char *s1, const char *s2, size_t n)
	{
		for (; n != 0; --n, ++s1, ++s2) {
			const int c1 = Fold(*s1);
			const int c2 = Fold(*s2);
			if (c1 < c2) return -1;
			if (c1 > c2) return 1;
		}
		return 0;
	}

	/**
	 * Find the first character equal to \a a, ignoring case, in the first \a n characters of \a s.
	 * @return Pointer to the match, or nullptr when there is none.
	 */
	static const char *find(const char *s, size_t n, char a)
	{
		const int wanted = Fold(a);
		for (; n > 0; --n, ++s) {
			if (Fold(*s) == wanted) return s;
		}
		return nullptr;
	}
};
311 
313 typedef std::basic_string_view<char, CaseInsensitiveCharTraits> CaseInsensitiveStringView;
314 
321 bool StrEndsWithIgnoreCase(std::string_view str, const std::string_view suffix)
322 {
323  if (str.size() < suffix.size()) return false;
324  return StrEqualsIgnoreCase(str.substr(str.size() - suffix.size()), suffix);
325 }
326 
334 int StrCompareIgnoreCase(const std::string_view str1, const std::string_view str2)
335 {
336  CaseInsensitiveStringView ci_str1{ str1.data(), str1.size() };
337  CaseInsensitiveStringView ci_str2{ str2.data(), str2.size() };
338  return ci_str1.compare(ci_str2);
339 }
340 
347 bool StrEqualsIgnoreCase(const std::string_view str1, const std::string_view str2)
348 {
349  if (str1.size() != str2.size()) return false;
350  return StrCompareIgnoreCase(str1, str2) == 0;
351 }
352 
359 size_t Utf8StringLength(const char *s)
360 {
361  size_t len = 0;
362  const char *t = s;
363  while (Utf8Consume(&t) != 0) len++;
364  return len;
365 }
366 
/**
 * Get the length of an UTF-8 encoded string in number of characters, not bytes.
 * @param str The string to get the length of; counting is done on its C string.
 * @return The length of the string in characters.
 */
size_t Utf8StringLength(const std::string &str)
{
	const char *const c_str = str.c_str();
	return Utf8StringLength(c_str);
}
377 
/**
 * Convert (part of) a string to lower case, in place.
 * @param str The string to convert.
 * @param offs Index of the first character to convert; everything before it is left untouched.
 * @return True iff at least one character was changed.
 */
bool strtolower(std::string &str, std::string::size_type offs)
{
	/* An offset at or beyond the end leaves nothing to convert. Forming
	 * "begin() + offs" for an offset past the end would be undefined. */
	if (offs >= str.size()) return false;

	bool changed = false;
	for (auto ch = str.begin() + offs; ch != str.end(); ++ch) {
		/* tolower requires a value representable as unsigned char. */
		auto new_ch = static_cast<char>(tolower(static_cast<unsigned char>(*ch)));
		changed |= new_ch != *ch;
		*ch = new_ch;
	}
	return changed;
}
388 
396 bool IsValidChar(char32_t key, CharSetFilter afilter)
397 {
398  switch (afilter) {
399  case CS_ALPHANUMERAL: return IsPrintable(key);
400  case CS_NUMERAL: return (key >= '0' && key <= '9');
401  case CS_NUMERAL_SPACE: return (key >= '0' && key <= '9') || key == ' ';
402  case CS_NUMERAL_SIGNED: return (key >= '0' && key <= '9') || key == '-';
403  case CS_ALPHA: return IsPrintable(key) && !(key >= '0' && key <= '9');
404  case CS_HEXADECIMAL: return (key >= '0' && key <= '9') || (key >= 'a' && key <= 'f') || (key >= 'A' && key <= 'F');
405  default: NOT_REACHED();
406  }
407 }
408 
409 
410 /* UTF-8 handling routines */
411 
412 
419 size_t Utf8Decode(char32_t *c, const char *s)
420 {
421  assert(c != nullptr);
422 
423  if (!HasBit(s[0], 7)) {
424  /* Single byte character: 0xxxxxxx */
425  *c = s[0];
426  return 1;
427  } else if (GB(s[0], 5, 3) == 6) {
428  if (IsUtf8Part(s[1])) {
429  /* Double byte character: 110xxxxx 10xxxxxx */
430  *c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
431  if (*c >= 0x80) return 2;
432  }
433  } else if (GB(s[0], 4, 4) == 14) {
434  if (IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
435  /* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
436  *c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
437  if (*c >= 0x800) return 3;
438  }
439  } else if (GB(s[0], 3, 5) == 30) {
440  if (IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
441  /* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
442  *c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
443  if (*c >= 0x10000 && *c <= 0x10FFFF) return 4;
444  }
445  }
446 
447  *c = '?';
448  return 1;
449 }
450 
451 
459 template <class T>
460 inline size_t Utf8Encode(T buf, char32_t c)
461 {
462  if (c < 0x80) {
463  *buf = c;
464  return 1;
465  } else if (c < 0x800) {
466  *buf++ = 0xC0 + GB(c, 6, 5);
467  *buf = 0x80 + GB(c, 0, 6);
468  return 2;
469  } else if (c < 0x10000) {
470  *buf++ = 0xE0 + GB(c, 12, 4);
471  *buf++ = 0x80 + GB(c, 6, 6);
472  *buf = 0x80 + GB(c, 0, 6);
473  return 3;
474  } else if (c < 0x110000) {
475  *buf++ = 0xF0 + GB(c, 18, 3);
476  *buf++ = 0x80 + GB(c, 12, 6);
477  *buf++ = 0x80 + GB(c, 6, 6);
478  *buf = 0x80 + GB(c, 0, 6);
479  return 4;
480  }
481 
482  *buf = '?';
483  return 1;
484 }
485 
486 size_t Utf8Encode(char *buf, char32_t c)
487 {
488  return Utf8Encode<char *>(buf, c);
489 }
490 
491 size_t Utf8Encode(std::ostreambuf_iterator<char> &buf, char32_t c)
492 {
493  return Utf8Encode<std::ostreambuf_iterator<char> &>(buf, c);
494 }
495 
496 size_t Utf8Encode(std::back_insert_iterator<std::string> &buf, char32_t c)
497 {
498  return Utf8Encode<std::back_insert_iterator<std::string> &>(buf, c);
499 }
500 
508 size_t Utf8TrimString(char *s, size_t maxlen)
509 {
510  size_t length = 0;
511 
512  for (const char *ptr = strchr(s, '\0'); *s != '\0';) {
513  size_t len = Utf8EncodedCharLen(*s);
514  /* Silently ignore invalid UTF8 sequences, our only concern trimming */
515  if (len == 0) len = 1;
516 
517  /* Take care when a hard cutoff was made for the string and
518  * the last UTF8 sequence is invalid */
519  if (length + len >= maxlen || (s + len > ptr)) break;
520  s += len;
521  length += len;
522  }
523 
524  *s = '\0';
525  return length;
526 }
527 
528 #ifdef DEFINE_STRCASESTR
529 char *strcasestr(const char *haystack, const char *needle)
530 {
531  size_t hay_len = strlen(haystack);
532  size_t needle_len = strlen(needle);
533  while (hay_len >= needle_len) {
534  if (strncasecmp(haystack, needle, needle_len) == 0) return const_cast<char *>(haystack);
535 
536  haystack++;
537  hay_len--;
538  }
539 
540  return nullptr;
541 }
542 #endif /* DEFINE_STRCASESTR */
543 
/**
 * Skip some of the 'garbage' at the begin of a string that should not be used to sort on.
 *
 * Garbage is every ASCII character below '0', and the punctuation ranges
 * ';' to '@', '[' to '`' and '{' to '~'. Bytes of 0x80 and above, i.e. the
 * bytes of multi-byte UTF-8 characters, are never garbage. They are compared
 * as unsigned values so the result does not depend on the signedness of char.
 *
 * @param str The string to skip the initial garbage of.
 * @return The string with the garbage skipped.
 */
static std::string_view SkipGarbage(std::string_view str)
{
	auto is_garbage = [](char ch) {
		const unsigned char c = static_cast<unsigned char>(ch);
		return c < '0' || (c >= ';' && c <= '@') || (c >= '[' && c <= '`') || (c >= '{' && c <= '~');
	};

	while (!str.empty() && is_garbage(str.front())) str.remove_prefix(1);
	return str;
}
557 
566 int StrNaturalCompare(std::string_view s1, std::string_view s2, bool ignore_garbage_at_front)
567 {
568  if (ignore_garbage_at_front) {
569  s1 = SkipGarbage(s1);
570  s2 = SkipGarbage(s2);
571  }
572 
573 #ifdef WITH_ICU_I18N
574  if (_current_collator) {
575  UErrorCode status = U_ZERO_ERROR;
576  int result = _current_collator->compareUTF8(icu::StringPiece(s1.data(), s1.size()), icu::StringPiece(s2.data(), s2.size()), status);
577  if (U_SUCCESS(status)) return result;
578  }
579 #endif /* WITH_ICU_I18N */
580 
581 #if defined(_WIN32) && !defined(STRGEN) && !defined(SETTINGSGEN)
582  int res = OTTDStringCompare(s1, s2);
583  if (res != 0) return res - 2; // Convert to normal C return values.
584 #endif
585 
586 #if defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN)
587  int res = MacOSStringCompare(s1, s2);
588  if (res != 0) return res - 2; // Convert to normal C return values.
589 #endif
590 
591  /* Do a normal comparison if ICU is missing or if we cannot create a collator. */
592  return StrCompareIgnoreCase(s1, s2);
593 }
594 
595 #ifdef WITH_ICU_I18N
596 
597 #include <unicode/stsearch.h>
598 
607 static int ICUStringContains(const std::string_view str, const std::string_view value, bool case_insensitive)
608 {
609  if (_current_collator) {
610  std::unique_ptr<icu::RuleBasedCollator> coll(dynamic_cast<icu::RuleBasedCollator *>(_current_collator->clone()));
611  if (coll) {
612  UErrorCode status = U_ZERO_ERROR;
613  coll->setStrength(case_insensitive ? icu::Collator::SECONDARY : icu::Collator::TERTIARY);
614  coll->setAttribute(UCOL_NUMERIC_COLLATION, UCOL_OFF, status);
615 
616  auto u_str = icu::UnicodeString::fromUTF8(icu::StringPiece(str.data(), str.size()));
617  auto u_value = icu::UnicodeString::fromUTF8(icu::StringPiece(value.data(), value.size()));
618  icu::StringSearch u_searcher(u_value, u_str, coll.get(), nullptr, status);
619  if (U_SUCCESS(status)) {
620  auto pos = u_searcher.first(status);
621  if (U_SUCCESS(status)) return pos != USEARCH_DONE ? 1 : 0;
622  }
623  }
624  }
625 
626  return -1;
627 }
628 #endif /* WITH_ICU_I18N */
629 
/**
 * Checks if a string is contained in another string with a locale-aware comparison that is case sensitive.
 *
 * The locale-aware backends that are compiled in are tried first; each
 * reports a negative value when it cannot decide. A plain byte-wise search
 * is the final fallback.
 *
 * @param str The string to search in.
 * @param value The string to search for.
 * @return True if a match was found.
 */
[[nodiscard]] bool StrNaturalContains(const std::string_view str, const std::string_view value)
{
#ifdef WITH_ICU_I18N
	const int icu_result = ICUStringContains(str, value, false);
	if (icu_result >= 0) return icu_result > 0;
#endif /* WITH_ICU_I18N */

#if defined(_WIN32) && !defined(STRGEN) && !defined(SETTINGSGEN)
	const int os_result = Win32StringContains(str, value, false);
	if (os_result >= 0) return os_result > 0;
#endif

#if defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN)
	const int os_result = MacOSStringContains(str, value, false);
	if (os_result >= 0) return os_result > 0;
#endif

	const auto found_at = str.find(value);
	return found_at != std::string_view::npos;
}
656 
664 [[nodiscard]] bool StrNaturalContainsIgnoreCase(const std::string_view str, const std::string_view value)
665 {
666 #ifdef WITH_ICU_I18N
667  int res_u = ICUStringContains(str, value, true);
668  if (res_u >= 0) return res_u > 0;
669 #endif /* WITH_ICU_I18N */
670 
671 #if defined(_WIN32) && !defined(STRGEN) && !defined(SETTINGSGEN)
672  int res = Win32StringContains(str, value, true);
673  if (res >= 0) return res > 0;
674 #endif
675 
676 #if defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN)
677  int res = MacOSStringContains(str, value, true);
678  if (res >= 0) return res > 0;
679 #endif
680 
681  CaseInsensitiveStringView ci_str{ str.data(), str.size() };
682  CaseInsensitiveStringView ci_value{ value.data(), value.size() };
683  return ci_str.find(ci_value) != CaseInsensitiveStringView::npos;
684 }
685 
/**
 * Convert a single hex-nibble to its numeric value.
 *
 * @param c The hex-nibble to convert, in either upper or lower case.
 * @return The value 0..15, or -1 if \a c is not a hex digit.
 */
static int ConvertHexNibbleToByte(char c)
{
	if ('0' <= c && c <= '9') return c - '0';
	if ('A' <= c && c <= 'F') return c - 'A' + 10;
	if ('a' <= c && c <= 'f') return c - 'a' + 10;

	/* Not a hexadecimal digit. */
	return -1;
}
699 
711 bool ConvertHexToBytes(std::string_view hex, std::span<uint8_t> bytes)
712 {
713  if (bytes.size() != hex.size() / 2) {
714  return false;
715  }
716 
717  /* Hex-string lengths are always divisible by 2. */
718  if (hex.size() % 2 != 0) {
719  return false;
720  }
721 
722  for (size_t i = 0; i < hex.size() / 2; i++) {
723  auto hi = ConvertHexNibbleToByte(hex[i * 2]);
724  auto lo = ConvertHexNibbleToByte(hex[i * 2 + 1]);
725 
726  if (hi < 0 || lo < 0) {
727  return false;
728  }
729 
730  bytes[i] = (hi << 4) | lo;
731  }
732 
733  return true;
734 }
735 
736 #ifdef WITH_UNISCRIBE
737 
738 /* static */ std::unique_ptr<StringIterator> StringIterator::Create()
739 {
740  return std::make_unique<UniscribeStringIterator>();
741 }
742 
743 #elif defined(WITH_ICU_I18N)
744 
745 #include <unicode/utext.h>
746 #include <unicode/brkiter.h>
747 
750 {
751  icu::BreakIterator *char_itr;
752  icu::BreakIterator *word_itr;
753 
754  std::vector<UChar> utf16_str;
755  std::vector<size_t> utf16_to_utf8;
756 
757 public:
758  IcuStringIterator() : char_itr(nullptr), word_itr(nullptr)
759  {
760  UErrorCode status = U_ZERO_ERROR;
761  this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != nullptr ? _current_language->isocode : "en"), status);
762  this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != nullptr ? _current_language->isocode : "en"), status);
763 
764  this->utf16_str.push_back('\0');
765  this->utf16_to_utf8.push_back(0);
766  }
767 
768  ~IcuStringIterator() override
769  {
770  delete this->char_itr;
771  delete this->word_itr;
772  }
773 
774  void SetString(const char *s) override
775  {
776  const char *string_base = s;
777 
778  /* Unfortunately current ICU versions only provide rudimentary support
779  * for word break iterators (especially for CJK languages) in combination
780  * with UTF-8 input. As a work around we have to convert the input to
781  * UTF-16 and create a mapping back to UTF-8 character indices. */
782  this->utf16_str.clear();
783  this->utf16_to_utf8.clear();
784 
785  while (*s != '\0') {
786  size_t idx = s - string_base;
787 
788  char32_t c = Utf8Consume(&s);
789  if (c < 0x10000) {
790  this->utf16_str.push_back((UChar)c);
791  } else {
792  /* Make a surrogate pair. */
793  this->utf16_str.push_back((UChar)(0xD800 + ((c - 0x10000) >> 10)));
794  this->utf16_str.push_back((UChar)(0xDC00 + ((c - 0x10000) & 0x3FF)));
795  this->utf16_to_utf8.push_back(idx);
796  }
797  this->utf16_to_utf8.push_back(idx);
798  }
799  this->utf16_str.push_back('\0');
800  this->utf16_to_utf8.push_back(s - string_base);
801 
802  UText text = UTEXT_INITIALIZER;
803  UErrorCode status = U_ZERO_ERROR;
804  utext_openUChars(&text, this->utf16_str.data(), this->utf16_str.size() - 1, &status);
805  this->char_itr->setText(&text, status);
806  this->word_itr->setText(&text, status);
807  this->char_itr->first();
808  this->word_itr->first();
809  }
810 
811  size_t SetCurPosition(size_t pos) override
812  {
813  /* Convert incoming position to an UTF-16 string index. */
814  uint utf16_pos = 0;
815  for (uint i = 0; i < this->utf16_to_utf8.size(); i++) {
816  if (this->utf16_to_utf8[i] == pos) {
817  utf16_pos = i;
818  break;
819  }
820  }
821 
822  /* isBoundary has the documented side-effect of setting the current
823  * position to the first valid boundary equal to or greater than
824  * the passed value. */
825  this->char_itr->isBoundary(utf16_pos);
826  return this->utf16_to_utf8[this->char_itr->current()];
827  }
828 
829  size_t Next(IterType what) override
830  {
831  int32_t pos;
832  switch (what) {
833  case ITER_CHARACTER:
834  pos = this->char_itr->next();
835  break;
836 
837  case ITER_WORD:
838  pos = this->word_itr->following(this->char_itr->current());
839  /* The ICU word iterator considers both the start and the end of a word a valid
840  * break point, but we only want word starts. Move to the next location in
841  * case the new position points to whitespace. */
842  while (pos != icu::BreakIterator::DONE &&
843  IsWhitespace(Utf16DecodeChar((const uint16_t *)&this->utf16_str[pos]))) {
844  int32_t new_pos = this->word_itr->next();
845  /* Don't set it to DONE if it was valid before. Otherwise we'll return END
846  * even though the iterator wasn't at the end of the string before. */
847  if (new_pos == icu::BreakIterator::DONE) break;
848  pos = new_pos;
849  }
850 
851  this->char_itr->isBoundary(pos);
852  break;
853 
854  default:
855  NOT_REACHED();
856  }
857 
858  return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
859  }
860 
861  size_t Prev(IterType what) override
862  {
863  int32_t pos;
864  switch (what) {
865  case ITER_CHARACTER:
866  pos = this->char_itr->previous();
867  break;
868 
869  case ITER_WORD:
870  pos = this->word_itr->preceding(this->char_itr->current());
871  /* The ICU word iterator considers both the start and the end of a word a valid
872  * break point, but we only want word starts. Move to the previous location in
873  * case the new position points to whitespace. */
874  while (pos != icu::BreakIterator::DONE &&
875  IsWhitespace(Utf16DecodeChar((const uint16_t *)&this->utf16_str[pos]))) {
876  int32_t new_pos = this->word_itr->previous();
877  /* Don't set it to DONE if it was valid before. Otherwise we'll return END
878  * even though the iterator wasn't at the start of the string before. */
879  if (new_pos == icu::BreakIterator::DONE) break;
880  pos = new_pos;
881  }
882 
883  this->char_itr->isBoundary(pos);
884  break;
885 
886  default:
887  NOT_REACHED();
888  }
889 
890  return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
891  }
892 };
893 
894 /* static */ std::unique_ptr<StringIterator> StringIterator::Create()
895 {
896  return std::make_unique<IcuStringIterator>();
897 }
898 
899 #else
900 
902 class DefaultStringIterator : public StringIterator
903 {
904  const char *string;
905  size_t len;
906  size_t cur_pos;
907 
908 public:
909  DefaultStringIterator() : string(nullptr), len(0), cur_pos(0)
910  {
911  }
912 
913  void SetString(const char *s) override
914  {
915  this->string = s;
916  this->len = strlen(s);
917  this->cur_pos = 0;
918  }
919 
920  size_t SetCurPosition(size_t pos) override
921  {
922  assert(this->string != nullptr && pos <= this->len);
923  /* Sanitize in case we get a position inside an UTF-8 sequence. */
924  while (pos > 0 && IsUtf8Part(this->string[pos])) pos--;
925  return this->cur_pos = pos;
926  }
927 
928  size_t Next(IterType what) override
929  {
930  assert(this->string != nullptr);
931 
932  /* Already at the end? */
933  if (this->cur_pos >= this->len) return END;
934 
935  switch (what) {
936  case ITER_CHARACTER: {
937  char32_t c;
938  this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
939  return this->cur_pos;
940  }
941 
942  case ITER_WORD: {
943  char32_t c;
944  /* Consume current word. */
945  size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
946  while (this->cur_pos < this->len && !IsWhitespace(c)) {
947  this->cur_pos += offs;
948  offs = Utf8Decode(&c, this->string + this->cur_pos);
949  }
950  /* Consume whitespace to the next word. */
951  while (this->cur_pos < this->len && IsWhitespace(c)) {
952  this->cur_pos += offs;
953  offs = Utf8Decode(&c, this->string + this->cur_pos);
954  }
955 
956  return this->cur_pos;
957  }
958 
959  default:
960  NOT_REACHED();
961  }
962 
963  return END;
964  }
965 
966  size_t Prev(IterType what) override
967  {
968  assert(this->string != nullptr);
969 
970  /* Already at the beginning? */
971  if (this->cur_pos == 0) return END;
972 
973  switch (what) {
974  case ITER_CHARACTER:
975  return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
976 
977  case ITER_WORD: {
978  const char *s = this->string + this->cur_pos;
979  char32_t c;
980  /* Consume preceding whitespace. */
981  do {
982  s = Utf8PrevChar(s);
983  Utf8Decode(&c, s);
984  } while (s > this->string && IsWhitespace(c));
985  /* Consume preceding word. */
986  while (s > this->string && !IsWhitespace(c)) {
987  s = Utf8PrevChar(s);
988  Utf8Decode(&c, s);
989  }
990  /* Move caret back to the beginning of the word. */
991  if (IsWhitespace(c)) Utf8Consume(&s);
992 
993  return this->cur_pos = s - this->string;
994  }
995 
996  default:
997  NOT_REACHED();
998  }
999 
1000  return END;
1001  }
1002 };
1003 
1004 #if defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN)
1005 /* static */ std::unique_ptr<StringIterator> StringIterator::Create()
1006 {
1007  std::unique_ptr<StringIterator> i = OSXStringIterator::Create();
1008  if (i != nullptr) return i;
1009 
1010  return std::make_unique<DefaultStringIterator>();
1011 }
1012 #else
1013 /* static */ std::unique_ptr<StringIterator> StringIterator::Create()
1014 {
1015  return std::make_unique<DefaultStringIterator>();
1016 }
1017 #endif /* defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN) */
1018 
1019 #endif
strecpy
void strecpy(std::span< char > dst, std::string_view src)
Copies characters from one buffer to another.
Definition: string.cpp:60
StrStartsWithIgnoreCase
bool StrStartsWithIgnoreCase(std::string_view str, const std::string_view prefix)
Check whether the given string starts with the given prefix, ignoring case.
Definition: string.cpp:281
StringIterator::Prev
virtual size_t Prev(IterType what=ITER_CHARACTER)=0
Move the cursor back by one iteration unit.
IcuStringIterator::utf16_to_utf8
std::vector< size_t > utf16_to_utf8
Mapping from UTF-16 code point position to index in the UTF-8 source string.
Definition: string.cpp:755
CaseInsensitiveStringView
std::basic_string_view< char, CaseInsensitiveCharTraits > CaseInsensitiveStringView
Case insensitive string view.
Definition: string.cpp:313
SVS_ALLOW_NEWLINE
@ SVS_ALLOW_NEWLINE
Allow newlines; replaces '\r\n' with '\n' during processing.
Definition: string_type.h:47
StringIterator::IterType
IterType
Type of the iterator.
Definition: string_base.h:17
IsInsideMM
constexpr bool IsInsideMM(const T x, const size_t min, const size_t max) noexcept
Checks if a value is in an interval.
Definition: math_func.hpp:268
win32.h
Win32StringContains
int Win32StringContains(const std::string_view str, const std::string_view value, bool case_insensitive)
Search if a string is contained in another string using the current locale.
Definition: win32.cpp:485
StringIterator::END
static const size_t END
Sentinel to indicate end-of-iteration.
Definition: string_base.h:23
math_func.hpp
GB
constexpr static debug_inline uint GB(const T x, const uint8_t s, const uint8_t n)
Fetch n bits from x, started at bit s.
Definition: bitmath_func.hpp:32
CaseInsensitiveCharTraits
Case insensitive implementation of the standard character type traits.
Definition: string.cpp:288
Utf8PrevChar
char * Utf8PrevChar(char *s)
Retrieve the previous UNICODE character in an UTF-8 encoded string.
Definition: string_func.h:149
StrMakeValid
static void StrMakeValid(T &dst, const char *str, const char *last, StringValidationSettings settings)
Copies the valid (UTF-8) characters from str up to last to the dst.
Definition: string.cpp:107
Utf16DecodeChar
char32_t Utf16DecodeChar(const uint16_t *c)
Decode an UTF-16 character.
Definition: string_func.h:202
IcuStringIterator::word_itr
icu::BreakIterator * word_itr
ICU iterator for words.
Definition: string.cpp:752
_current_collator
std::unique_ptr< icu::Collator > _current_collator
Collator for the language currently in use.
Definition: strings.cpp:59
StringIterator::Next
virtual size_t Next(IterType what=ITER_CHARACTER)=0
Advance the cursor by one iteration unit.
CS_ALPHA
@ CS_ALPHA
Only alphabetic values.
Definition: string_type.h:29
StrNaturalCompare
int StrNaturalCompare(std::string_view s1, std::string_view s2, bool ignore_garbage_at_front)
Compares two strings using case insensitive natural sort.
Definition: string.cpp:566
StrEndsWithIgnoreCase
bool StrEndsWithIgnoreCase(std::string_view str, const std::string_view suffix)
Check whether the given string ends with the given suffix, ignoring case.
Definition: string.cpp:321
ConvertHexToBytes
bool ConvertHexToBytes(std::string_view hex, std::span< uint8_t > bytes)
Convert a hex-string to a byte-array, while validating it was actually hex.
Definition: string.cpp:711
StringIterator::SetString
virtual void SetString(const char *s)=0
Set a new iteration string.
IcuStringIterator::char_itr
icu::BreakIterator * char_itr
ICU iterator for characters.
Definition: string.cpp:751
StringIterator::ITER_CHARACTER
@ ITER_CHARACTER
Iterate over characters (or more exactly grapheme clusters).
Definition: string_base.h:18
Debug
#define Debug(category, level, format_string,...)
Output a line of debugging information.
Definition: debug.h:37
control_codes.h
IcuStringIterator::Next
size_t Next(IterType what) override
Advance the cursor by one iteration unit.
Definition: string.cpp:829
string_osx.h
gfx_func.h
Utf8StringLength
size_t Utf8StringLength(const char *s)
Get the length of an UTF-8 encoded string in number of characters and thus not the number of bytes th...
Definition: string.cpp:359
SVS_ALLOW_CONTROL_CODE
@ SVS_ALLOW_CONTROL_CODE
Allow the special control codes.
Definition: string_type.h:48
error_func.h
StringIterator
Class for iterating over different kind of parts of a string.
Definition: string_base.h:14
IcuStringIterator::SetCurPosition
size_t SetCurPosition(size_t pos) override
Change the current string cursor.
Definition: string.cpp:811
FormatArrayAsHex
std::string FormatArrayAsHex(std::span< const uint8_t > data)
Format a byte array into a continuous hex string.
Definition: string.cpp:81
StringIterator::SetCurPosition
virtual size_t SetCurPosition(size_t pos)=0
Change the current string cursor.
StrMakeValidInPlace
void StrMakeValidInPlace(char *str, const char *last, StringValidationSettings settings)
Scans the string for invalid characters and replaces them with a question mark '?' (if not ignored).
Definition: string.cpp:178
StrTrimInPlace
void StrTrimInPlace(std::string &str)
Trim the spaces from given string in place, i.e.
Definition: string.cpp:260
ConvertHexNibbleToByte
static int ConvertHexNibbleToByte(char c)
Convert a single hex-nibble to a byte.
Definition: string.cpp:692
MacOSStringContains
int MacOSStringContains(const std::string_view str, const std::string_view value, bool case_insensitive)
Search if a string is contained in another string using the current locale.
Definition: string_osx.cpp:353
MacOSStringCompare
int MacOSStringCompare(std::string_view s1, std::string_view s2)
Compares two strings using case insensitive natural sort.
Definition: string_osx.cpp:329
_current_language
const LanguageMetadata * _current_language
The currently loaded language.
Definition: strings.cpp:54
safeguards.h
IcuStringIterator::utf16_str
std::vector< UChar > utf16_str
UTF-16 copy of the string.
Definition: string.cpp:754
settings
fluid_settings_t * settings
FluidSynth settings handle.
Definition: fluidsynth.cpp:21
StringIterator::Create
static std::unique_ptr< StringIterator > Create()
Create a new iterator instance.
Definition: string.cpp:894
StringIterator::ITER_WORD
@ ITER_WORD
Iterate over words.
Definition: string_base.h:19
language.h
stdafx.h
CS_ALPHANUMERAL
@ CS_ALPHANUMERAL
Both numeric and alphabetic and spaces and stuff.
Definition: string_type.h:25
LanguagePackHeader::isocode
char isocode[16]
the ISO code for the language (not country code)
Definition: language.h:31
StringValidationSettings
StringValidationSettings
Settings for the string validation.
Definition: string_type.h:44
SkipGarbage
static std::string_view SkipGarbage(std::string_view str)
Skip some of the 'garbage' in the string that we don't want to use to sort on.
Definition: string.cpp:552
string_func.h
Utf8EncodedCharLen
int8_t Utf8EncodedCharLen(char c)
Return the length of an UTF-8 encoded value based on a single char.
Definition: string_func.h:124
IsValidChar
bool IsValidChar(char32_t key, CharSetFilter afilter)
Only allow certain keys.
Definition: string.cpp:396
ICUStringContains
static int ICUStringContains(const std::string_view str, const std::string_view value, bool case_insensitive)
Search if a string is contained in another string using the current locale.
Definition: string.cpp:607
alloc_func.hpp
IcuStringIterator::SetString
void SetString(const char *s) override
Set a new iteration string.
Definition: string.cpp:774
IcuStringIterator
String iterator using ICU as a backend.
Definition: string.cpp:749
StrEqualsIgnoreCase
bool StrEqualsIgnoreCase(const std::string_view str1, const std::string_view str2)
Compares two string( view)s for equality, while ignoring the case of the characters.
Definition: string.cpp:347
StrValid
bool StrValid(std::span< const char > str)
Checks whether the given string is valid, i.e.
Definition: string.cpp:227
IsWhitespace
bool IsWhitespace(char32_t c)
Check whether UNICODE character is whitespace or not, i.e.
Definition: string_func.h:249
CS_NUMERAL_SIGNED
@ CS_NUMERAL_SIGNED
Only numbers and '-' for negative values.
Definition: string_type.h:28
Utf8TrimString
size_t Utf8TrimString(char *s, size_t maxlen)
Properly terminate an UTF8 string to some maximum length.
Definition: string.cpp:508
CS_NUMERAL_SPACE
@ CS_NUMERAL_SPACE
Only numbers and spaces.
Definition: string_type.h:27
Utf8Encode
size_t Utf8Encode(T buf, char32_t c)
Encode a unicode character and place it in the buffer.
Definition: string.cpp:460
SVS_REPLACE_TAB_CR_NL_WITH_SPACE
@ SVS_REPLACE_TAB_CR_NL_WITH_SPACE
Replace tabs ('\t'), carriage returns ('\r') and newlines ('\n') with spaces.
Definition: string_type.h:54
Utf8Decode
size_t Utf8Decode(char32_t *c, const char *s)
Decode and consume the next UTF-8 encoded character.
Definition: string.cpp:419
CS_HEXADECIMAL
@ CS_HEXADECIMAL
Only hexadecimal characters.
Definition: string_type.h:30
SVS_REPLACE_WITH_QUESTION_MARK
@ SVS_REPLACE_WITH_QUESTION_MARK
Replace the unknown/bad bits with question marks.
Definition: string_type.h:46
StrNaturalContains
bool StrNaturalContains(const std::string_view str, const std::string_view value)
Checks if a string is contained in another string with a locale-aware comparison that is case sensiti...
Definition: string.cpp:637
StrNaturalContainsIgnoreCase
bool StrNaturalContainsIgnoreCase(const std::string_view str, const std::string_view value)
Checks if a string is contained in another string with a locale-aware comparison that is case insensi...
Definition: string.cpp:664
CS_NUMERAL
@ CS_NUMERAL
Only numeric ones.
Definition: string_type.h:26
CharSetFilter
CharSetFilter
Valid filter types for IsValidChar.
Definition: string_type.h:24
StrCompareIgnoreCase
int StrCompareIgnoreCase(const std::string_view str1, const std::string_view str2)
Compares two string( view)s, while ignoring the case of the characters.
Definition: string.cpp:334
debug.h
string_uniscribe.h
IcuStringIterator::Prev
size_t Prev(IterType what) override
Move the cursor back by one iteration unit.
Definition: string.cpp:861
HasBit
constexpr debug_inline bool HasBit(const T x, const uint8_t y)
Checks if a bit in a value is set.
Definition: bitmath_func.hpp:103