OpenTTD Source 20250312-master-gcdcc6b491d
string.cpp
Go to the documentation of this file.
1/*
2 * This file is part of OpenTTD.
3 * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
4 * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
5 * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
6 */
7
10#include "stdafx.h"
11#include "debug.h"
12#include "core/math_func.hpp"
13#include "error_func.h"
14#include "string_func.h"
15#include "string_base.h"
16
17#include "table/control_codes.h"
18
19#include <sstream>
20#include <iomanip>
21
22#ifdef _MSC_VER
23# define strncasecmp strnicmp
24#endif
25
26#ifdef _WIN32
27# include "os/windows/win32.h"
28#endif
29
30#ifdef WITH_UNISCRIBE
32#endif
33
34#ifdef WITH_ICU_I18N
35/* Required by StrNaturalCompare. */
36# include <unicode/ustring.h>
37# include "language.h"
38# include "gfx_func.h"
39#endif /* WITH_ICU_I18N */
40
41#if defined(WITH_COCOA)
42# include "os/macosx/string_osx.h"
43#endif
44
45#include "safeguards.h"
46
47
59void strecpy(std::span<char> dst, std::string_view src)
60{
61 /* Ensure source string fits with NUL terminator; dst must be at least 1 character longer than src. */
62 if (std::empty(dst) || std::size(src) >= std::size(dst) - 1U) {
63#if defined(STRGEN) || defined(SETTINGSGEN)
64 FatalError("String too long for destination buffer");
65#else /* STRGEN || SETTINGSGEN */
66 Debug(misc, 0, "String too long for destination buffer");
67 src = src.substr(0, std::size(dst) - 1U);
68#endif /* STRGEN || SETTINGSGEN */
69 }
70
71 auto it = std::copy(std::begin(src), std::end(src), std::begin(dst));
72 *it = '\0';
73}
74
80std::string FormatArrayAsHex(std::span<const uint8_t> data)
81{
82 std::string str;
83 str.reserve(data.size() * 2 + 1);
84
85 for (auto b : data) {
86 fmt::format_to(std::back_inserter(str), "{:02X}", b);
87 }
88
89 return str;
90}
91
97static bool IsSccEncodedCode(char32_t c)
98{
99 switch (c) {
100 case SCC_RECORD_SEPARATOR:
101 case SCC_ENCODED:
105 return true;
106
107 default:
108 return false;
109 }
110}
111
124template <class T>
125static void StrMakeValid(T &dst, const char *str, const char *last, StringValidationSettings settings)
126{
127 /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
128
129 while (str <= last && *str != '\0') {
130 size_t len = Utf8EncodedCharLen(*str);
131 char32_t c;
132 /* If the first byte does not look like the first byte of an encoded
133 * character, i.e. encoded length is 0, then this byte is definitely bad
134 * and it should be skipped.
135 * When the first byte looks like the first byte of an encoded character,
136 * then the remaining bytes in the string are checked whether the whole
137 * encoded character can be there. If that is not the case, this byte is
138 * skipped.
139 * Finally we attempt to decode the encoded character, which does certain
140 * extra validations to see whether the correct number of bytes were used
141 * to encode the character. If that is not the case, the byte is probably
142 * invalid and it is skipped. We could emit a question mark, but then the
143 * logic below cannot just copy bytes, it would need to re-encode the
144 * decoded characters as the length in bytes may have changed.
145 *
146 * The goals here is to get as much valid Utf8 encoded characters from the
147 * source string to the destination string.
148 *
149 * Note: a multi-byte encoded termination ('\0') will trigger the encoded
150 * char length and the decoded length to differ, so it will be ignored as
151 * invalid character data. If it were to reach the termination, then we
152 * would also reach the "last" byte of the string and a normal '\0'
153 * termination will be placed after it.
154 */
155 if (len == 0 || str + len > last + 1 || len != Utf8Decode(&c, str)) {
156 /* Maybe the next byte is still a valid character? */
157 str++;
158 continue;
159 }
160
161 if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || ((settings & SVS_ALLOW_CONTROL_CODE) != 0 && IsSccEncodedCode(c))) {
162 /* Copy the character back. Even if dst is current the same as str
163 * (i.e. no characters have been changed) this is quicker than
164 * moving the pointers ahead by len */
165 do {
166 *dst++ = *str++;
167 } while (--len != 0);
168 } else if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\n') {
169 *dst++ = *str++;
170 } else {
171 if ((settings & SVS_ALLOW_NEWLINE) != 0 && c == '\r' && str[1] == '\n') {
172 str += len;
173 continue;
174 }
175 str += len;
176 if ((settings & SVS_REPLACE_TAB_CR_NL_WITH_SPACE) != 0 && (c == '\r' || c == '\n' || c == '\t')) {
177 /* Replace the tab, carriage return or newline with a space. */
178 *dst++ = ' ';
179 } else if ((settings & SVS_REPLACE_WITH_QUESTION_MARK) != 0) {
180 /* Replace the undesirable character with a question mark */
181 *dst++ = '?';
182 }
183 }
184 }
185
186 /* String termination, if needed, is left to the caller of this function. */
187}
188
196void StrMakeValidInPlace(char *str, const char *last, StringValidationSettings settings)
197{
198 char *dst = str;
199 StrMakeValid(dst, str, last, settings);
200 *dst = '\0';
201}
202
211{
212 /* We know it is '\0' terminated. */
213 StrMakeValidInPlace(str, str + strlen(str), settings);
214}
215
223std::string StrMakeValid(std::string_view str, StringValidationSettings settings)
224{
225 if (str.empty()) return {};
226
227 auto buf = str.data();
228 auto last = buf + str.size() - 1;
229
230 std::ostringstream dst;
231 std::ostreambuf_iterator<char> dst_iter(dst);
232 StrMakeValid(dst_iter, buf, last, settings);
233
234 return dst.str();
235}
236
245bool StrValid(std::span<const char> str)
246{
247 /* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
248 auto it = std::begin(str);
249 auto last = std::prev(std::end(str));
250
251 while (it <= last && *it != '\0') {
252 size_t len = Utf8EncodedCharLen(*it);
253 /* Encoded length is 0 if the character isn't known.
254 * The length check is needed to prevent Utf8Decode to read
255 * over the terminating '\0' if that happens to be placed
256 * within the encoding of an UTF8 character. */
257 if (len == 0 || it + len > last) return false;
258
259 char32_t c;
260 len = Utf8Decode(&c, &*it);
261 if (!IsPrintable(c) || (c >= SCC_SPRITE_START && c <= SCC_SPRITE_END)) {
262 return false;
263 }
264
265 it += len;
266 }
267
268 return *it == '\0';
269}
270
278void StrTrimInPlace(std::string &str)
279{
280 str = StrTrimView(str);
281}
282
/**
 * Get the part of the string without its leading and trailing spaces.
 * Only the space character (' ') is trimmed.
 * @param str The string to trim.
 * @return View into \a str without the surrounding spaces; an empty view
 *         when \a str consists of nothing but spaces.
 */
std::string_view StrTrimView(std::string_view str)
{
	const size_t begin = str.find_first_not_of(' ');
	if (begin == std::string::npos) return std::string_view{};

	/* A non-space character exists, so the search from the back finds one too. */
	const size_t end = str.find_last_not_of(' ') + 1;
	return str.substr(begin, end - begin);
}
292
299bool StrStartsWithIgnoreCase(std::string_view str, const std::string_view prefix)
300{
301 if (str.size() < prefix.size()) return false;
302 return StrEqualsIgnoreCase(str.substr(0, prefix.size()), prefix);
303}
304
/** Case insensitive implementation of the standard character type traits. */
struct CaseInsensitiveCharTraits : public std::char_traits<char> {
	/**
	 * Upper-case a single character for comparison purposes.
	 * toupper is only defined for values representable as unsigned char (and
	 * EOF), so the character must not be passed as a possibly negative char.
	 */
	static int Upper(char c) { return toupper(static_cast<unsigned char>(c)); }

	static bool eq(char c1, char c2) { return Upper(c1) == Upper(c2); }
	static bool ne(char c1, char c2) { return Upper(c1) != Upper(c2); }
	static bool lt(char c1, char c2) { return Upper(c1) < Upper(c2); }

	/**
	 * Compare the first \a n characters of two strings, ignoring case.
	 * @return -1, 0 or 1 when \a s1 sorts before, equal to or after \a s2.
	 */
	static int compare(const char *s1, const char *s2, size_t n)
	{
		for (; n != 0; --n, ++s1, ++s2) {
			const int u1 = Upper(*s1);
			const int u2 = Upper(*s2);
			if (u1 < u2) return -1;
			if (u1 > u2) return 1;
		}
		return 0;
	}

	/**
	 * Find the first character in the first \a n characters of \a s that
	 * equals \a a, ignoring case.
	 * @return Pointer to the found character, or nullptr when there is none.
	 */
	static const char *find(const char *s, size_t n, char a)
	{
		const int wanted = Upper(a);
		for (; n > 0; --n, ++s) {
			if (Upper(*s) == wanted) return s;
		}
		return nullptr;
	}
};

/** Case insensitive string view. */
using CaseInsensitiveStringView = std::basic_string_view<char, CaseInsensitiveCharTraits>;
332
339bool StrEndsWithIgnoreCase(std::string_view str, const std::string_view suffix)
340{
341 if (str.size() < suffix.size()) return false;
342 return StrEqualsIgnoreCase(str.substr(str.size() - suffix.size()), suffix);
343}
344
352int StrCompareIgnoreCase(const std::string_view str1, const std::string_view str2)
353{
354 CaseInsensitiveStringView ci_str1{ str1.data(), str1.size() };
355 CaseInsensitiveStringView ci_str2{ str2.data(), str2.size() };
356 return ci_str1.compare(ci_str2);
357}
358
365bool StrEqualsIgnoreCase(const std::string_view str1, const std::string_view str2)
366{
367 if (str1.size() != str2.size()) return false;
368 return StrCompareIgnoreCase(str1, str2) == 0;
369}
370
377size_t Utf8StringLength(const char *s)
378{
379 size_t len = 0;
380 const char *t = s;
381 while (Utf8Consume(&t) != 0) len++;
382 return len;
383}
384
/**
 * Get the length of an UTF-8 encoded string in number of characters
 * and thus not the number of bytes that the encoded string contains.
 * Counting is done on the C-string, so it stops at the first '\0'.
 * @param str The string to get the length for.
 * @return The length of the string in characters.
 */
size_t Utf8StringLength(const std::string &str)
{
	const char *terminated = str.c_str();
	return Utf8StringLength(terminated);
}
395
/**
 * Convert the characters of the string from position \a offs onwards to lower case.
 * @param str The string to convert in place.
 * @param offs Index of the first character to convert.
 * @return True iff at least one character was changed.
 */
bool strtolower(std::string &str, std::string::size_type offs)
{
	bool changed = false;
	for (std::string::size_type i = offs; i < str.size(); ++i) {
		/* tolower needs the value as unsigned char, not as a possibly negative char. */
		const char lowered = static_cast<char>(tolower(static_cast<unsigned char>(str[i])));
		if (lowered != str[i]) changed = true;
		str[i] = lowered;
	}
	return changed;
}
406
414bool IsValidChar(char32_t key, CharSetFilter afilter)
415{
416 switch (afilter) {
417 case CS_ALPHANUMERAL: return IsPrintable(key);
418 case CS_NUMERAL: return (key >= '0' && key <= '9');
419 case CS_NUMERAL_SPACE: return (key >= '0' && key <= '9') || key == ' ';
420 case CS_NUMERAL_SIGNED: return (key >= '0' && key <= '9') || key == '-';
421 case CS_ALPHA: return IsPrintable(key) && !(key >= '0' && key <= '9');
422 case CS_HEXADECIMAL: return (key >= '0' && key <= '9') || (key >= 'a' && key <= 'f') || (key >= 'A' && key <= 'F');
423 default: NOT_REACHED();
424 }
425}
426
427
428/* UTF-8 handling routines */
429
430
437size_t Utf8Decode(char32_t *c, const char *s)
438{
439 assert(c != nullptr);
440
441 if (!HasBit(s[0], 7)) {
442 /* Single byte character: 0xxxxxxx */
443 *c = s[0];
444 return 1;
445 } else if (GB(s[0], 5, 3) == 6) {
446 if (IsUtf8Part(s[1])) {
447 /* Double byte character: 110xxxxx 10xxxxxx */
448 *c = GB(s[0], 0, 5) << 6 | GB(s[1], 0, 6);
449 if (*c >= 0x80) return 2;
450 }
451 } else if (GB(s[0], 4, 4) == 14) {
452 if (IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
453 /* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
454 *c = GB(s[0], 0, 4) << 12 | GB(s[1], 0, 6) << 6 | GB(s[2], 0, 6);
455 if (*c >= 0x800) return 3;
456 }
457 } else if (GB(s[0], 3, 5) == 30) {
458 if (IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
459 /* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
460 *c = GB(s[0], 0, 3) << 18 | GB(s[1], 0, 6) << 12 | GB(s[2], 0, 6) << 6 | GB(s[3], 0, 6);
461 if (*c >= 0x10000 && *c <= 0x10FFFF) return 4;
462 }
463 }
464
465 *c = '?';
466 return 1;
467}
468
469
477template <class T>
478inline size_t Utf8Encode(T buf, char32_t c)
479{
480 if (c < 0x80) {
481 *buf = c;
482 return 1;
483 } else if (c < 0x800) {
484 *buf++ = 0xC0 + GB(c, 6, 5);
485 *buf = 0x80 + GB(c, 0, 6);
486 return 2;
487 } else if (c < 0x10000) {
488 *buf++ = 0xE0 + GB(c, 12, 4);
489 *buf++ = 0x80 + GB(c, 6, 6);
490 *buf = 0x80 + GB(c, 0, 6);
491 return 3;
492 } else if (c < 0x110000) {
493 *buf++ = 0xF0 + GB(c, 18, 3);
494 *buf++ = 0x80 + GB(c, 12, 6);
495 *buf++ = 0x80 + GB(c, 6, 6);
496 *buf = 0x80 + GB(c, 0, 6);
497 return 4;
498 }
499
500 *buf = '?';
501 return 1;
502}
503
504size_t Utf8Encode(char *buf, char32_t c)
505{
506 return Utf8Encode<char *>(buf, c);
507}
508
509size_t Utf8Encode(std::ostreambuf_iterator<char> &buf, char32_t c)
510{
511 return Utf8Encode<std::ostreambuf_iterator<char> &>(buf, c);
512}
513
514size_t Utf8Encode(std::back_insert_iterator<std::string> &buf, char32_t c)
515{
516 return Utf8Encode<std::back_insert_iterator<std::string> &>(buf, c);
517}
518
526size_t Utf8TrimString(char *s, size_t maxlen)
527{
528 size_t length = 0;
529
530 for (const char *ptr = strchr(s, '\0'); *s != '\0';) {
531 size_t len = Utf8EncodedCharLen(*s);
532 /* Silently ignore invalid UTF8 sequences, our only concern trimming */
533 if (len == 0) len = 1;
534
535 /* Take care when a hard cutoff was made for the string and
536 * the last UTF8 sequence is invalid */
537 if (length + len >= maxlen || (s + len > ptr)) break;
538 s += len;
539 length += len;
540 }
541
542 *s = '\0';
543 return length;
544}
545
546#ifdef DEFINE_STRCASESTR
547char *strcasestr(const char *haystack, const char *needle)
548{
549 size_t hay_len = strlen(haystack);
550 size_t needle_len = strlen(needle);
551 while (hay_len >= needle_len) {
552 if (strncasecmp(haystack, needle, needle_len) == 0) return const_cast<char *>(haystack);
553
554 haystack++;
555 hay_len--;
556 }
557
558 return nullptr;
559}
560#endif /* DEFINE_STRCASESTR */
561
567static bool IsGarbageCharacter(char32_t c)
568{
569 if (c >= '0' && c <= '9') return false;
570 if (c >= 'A' && c <= 'Z') return false;
571 if (c >= 'a' && c <= 'z') return false;
572 if (c >= SCC_CONTROL_START && c <= SCC_CONTROL_END) return true;
573 if (c >= 0xC0 && c <= 0x10FFFF) return false;
574
575 return true;
576}
577
586static std::string_view SkipGarbage(std::string_view str)
587{
588 auto first = std::begin(str);
589 auto last = std::end(str);
590 while (first < last) {
591 char32_t c;
592 size_t len = Utf8Decode(&c, &*first);
593 if (!IsGarbageCharacter(c)) break;
594 first += len;
595 }
596 return {first, last};
597}
598
607int StrNaturalCompare(std::string_view s1, std::string_view s2, bool ignore_garbage_at_front)
608{
609 if (ignore_garbage_at_front) {
610 s1 = SkipGarbage(s1);
611 s2 = SkipGarbage(s2);
612 }
613
614#ifdef WITH_ICU_I18N
615 if (_current_collator) {
616 UErrorCode status = U_ZERO_ERROR;
617 int result = _current_collator->compareUTF8(icu::StringPiece(s1.data(), s1.size()), icu::StringPiece(s2.data(), s2.size()), status);
618 if (U_SUCCESS(status)) return result;
619 }
620#endif /* WITH_ICU_I18N */
621
622#if defined(_WIN32) && !defined(STRGEN) && !defined(SETTINGSGEN)
623 int res = OTTDStringCompare(s1, s2);
624 if (res != 0) return res - 2; // Convert to normal C return values.
625#endif
626
627#if defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN)
628 int res = MacOSStringCompare(s1, s2);
629 if (res != 0) return res - 2; // Convert to normal C return values.
630#endif
631
632 /* Do a normal comparison if ICU is missing or if we cannot create a collator. */
633 return StrCompareIgnoreCase(s1, s2);
634}
635
636#ifdef WITH_ICU_I18N
637
638#include <unicode/stsearch.h>
639
648static int ICUStringContains(const std::string_view str, const std::string_view value, bool case_insensitive)
649{
650 if (_current_collator) {
651 std::unique_ptr<icu::RuleBasedCollator> coll(dynamic_cast<icu::RuleBasedCollator *>(_current_collator->clone()));
652 if (coll) {
653 UErrorCode status = U_ZERO_ERROR;
654 coll->setStrength(case_insensitive ? icu::Collator::SECONDARY : icu::Collator::TERTIARY);
655 coll->setAttribute(UCOL_NUMERIC_COLLATION, UCOL_OFF, status);
656
657 auto u_str = icu::UnicodeString::fromUTF8(icu::StringPiece(str.data(), str.size()));
658 auto u_value = icu::UnicodeString::fromUTF8(icu::StringPiece(value.data(), value.size()));
659 icu::StringSearch u_searcher(u_value, u_str, coll.get(), nullptr, status);
660 if (U_SUCCESS(status)) {
661 auto pos = u_searcher.first(status);
662 if (U_SUCCESS(status)) return pos != USEARCH_DONE ? 1 : 0;
663 }
664 }
665 }
666
667 return -1;
668}
669#endif /* WITH_ICU_I18N */
670
/**
 * Checks if a string is contained in another string with a locale-aware
 * comparison that is case sensitive.
 *
 * Each compiled-in search reports a negative value when it cannot do the
 * search, in which case the next one is tried; a plain substring search is
 * the last resort.
 *
 * @param str The string to search in.
 * @param value The string to search for.
 * @return True if a match was found.
 */
[[nodiscard]] bool StrNaturalContains(const std::string_view str, const std::string_view value)
{
#ifdef WITH_ICU_I18N
	if (int icu_result = ICUStringContains(str, value, false); icu_result >= 0) return icu_result > 0;
#endif /* WITH_ICU_I18N */

#if defined(_WIN32) && !defined(STRGEN) && !defined(SETTINGSGEN)
	if (int os_result = Win32StringContains(str, value, false); os_result >= 0) return os_result > 0;
#endif

#if defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN)
	if (int os_result = MacOSStringContains(str, value, false); os_result >= 0) return os_result > 0;
#endif

	return str.find(value) != std::string_view::npos;
}
697
705[[nodiscard]] bool StrNaturalContainsIgnoreCase(const std::string_view str, const std::string_view value)
706{
707#ifdef WITH_ICU_I18N
708 int res_u = ICUStringContains(str, value, true);
709 if (res_u >= 0) return res_u > 0;
710#endif /* WITH_ICU_I18N */
711
712#if defined(_WIN32) && !defined(STRGEN) && !defined(SETTINGSGEN)
713 int res = Win32StringContains(str, value, true);
714 if (res >= 0) return res > 0;
715#endif
716
717#if defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN)
718 int res = MacOSStringContains(str, value, true);
719 if (res >= 0) return res > 0;
720#endif
721
722 CaseInsensitiveStringView ci_str{ str.data(), str.size() };
723 CaseInsensitiveStringView ci_value{ value.data(), value.size() };
724 return ci_str.find(ci_value) != CaseInsensitiveStringView::npos;
725}
726
/**
 * Convert a single hex-nibble to a byte.
 *
 * @param c The hex-nibble to convert.
 * @return The byte the hex-nibble represents, or -1 if it is not a valid hex-nibble.
 */
static int ConvertHexNibbleToByte(char c)
{
	/* Measure the distance to the start of each range of hex digits. */
	if (const int digit = c - '0'; digit >= 0 && digit <= 9) return digit;
	if (const int upper = c - 'A'; upper >= 0 && upper <= 5) return upper + 10;
	if (const int lower = c - 'a'; lower >= 0 && lower <= 5) return lower + 10;
	return -1;
}
740
752bool ConvertHexToBytes(std::string_view hex, std::span<uint8_t> bytes)
753{
754 if (bytes.size() != hex.size() / 2) {
755 return false;
756 }
757
758 /* Hex-string lengths are always divisible by 2. */
759 if (hex.size() % 2 != 0) {
760 return false;
761 }
762
763 for (size_t i = 0; i < hex.size() / 2; i++) {
764 auto hi = ConvertHexNibbleToByte(hex[i * 2]);
765 auto lo = ConvertHexNibbleToByte(hex[i * 2 + 1]);
766
767 if (hi < 0 || lo < 0) {
768 return false;
769 }
770
771 bytes[i] = (hi << 4) | lo;
772 }
773
774 return true;
775}
776
777#ifdef WITH_UNISCRIBE
778
779/* static */ std::unique_ptr<StringIterator> StringIterator::Create()
780{
781 return std::make_unique<UniscribeStringIterator>();
782}
783
784#elif defined(WITH_ICU_I18N)
785
786#include <unicode/utext.h>
787#include <unicode/brkiter.h>
788
791{
792 icu::BreakIterator *char_itr;
793 icu::BreakIterator *word_itr;
794
795 std::vector<UChar> utf16_str;
796 std::vector<size_t> utf16_to_utf8;
797
798public:
799 IcuStringIterator() : char_itr(nullptr), word_itr(nullptr)
800 {
801 UErrorCode status = U_ZERO_ERROR;
802 this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != nullptr ? _current_language->isocode : "en"), status);
803 this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != nullptr ? _current_language->isocode : "en"), status);
804
805 this->utf16_str.push_back('\0');
806 this->utf16_to_utf8.push_back(0);
807 }
808
809 ~IcuStringIterator() override
810 {
811 delete this->char_itr;
812 delete this->word_itr;
813 }
814
815 void SetString(const char *s) override
816 {
817 const char *string_base = s;
818
819 /* Unfortunately current ICU versions only provide rudimentary support
820 * for word break iterators (especially for CJK languages) in combination
821 * with UTF-8 input. As a work around we have to convert the input to
822 * UTF-16 and create a mapping back to UTF-8 character indices. */
823 this->utf16_str.clear();
824 this->utf16_to_utf8.clear();
825
826 while (*s != '\0') {
827 size_t idx = s - string_base;
828
829 char32_t c = Utf8Consume(&s);
830 if (c < 0x10000) {
831 this->utf16_str.push_back((UChar)c);
832 } else {
833 /* Make a surrogate pair. */
834 this->utf16_str.push_back((UChar)(0xD800 + ((c - 0x10000) >> 10)));
835 this->utf16_str.push_back((UChar)(0xDC00 + ((c - 0x10000) & 0x3FF)));
836 this->utf16_to_utf8.push_back(idx);
837 }
838 this->utf16_to_utf8.push_back(idx);
839 }
840 this->utf16_str.push_back('\0');
841 this->utf16_to_utf8.push_back(s - string_base);
842
843 UText text = UTEXT_INITIALIZER;
844 UErrorCode status = U_ZERO_ERROR;
845 utext_openUChars(&text, this->utf16_str.data(), this->utf16_str.size() - 1, &status);
846 this->char_itr->setText(&text, status);
847 this->word_itr->setText(&text, status);
848 this->char_itr->first();
849 this->word_itr->first();
850 }
851
852 size_t SetCurPosition(size_t pos) override
853 {
854 /* Convert incoming position to an UTF-16 string index. */
855 uint utf16_pos = 0;
856 for (uint i = 0; i < this->utf16_to_utf8.size(); i++) {
857 if (this->utf16_to_utf8[i] == pos) {
858 utf16_pos = i;
859 break;
860 }
861 }
862
863 /* isBoundary has the documented side-effect of setting the current
864 * position to the first valid boundary equal to or greater than
865 * the passed value. */
866 this->char_itr->isBoundary(utf16_pos);
867 return this->utf16_to_utf8[this->char_itr->current()];
868 }
869
870 size_t Next(IterType what) override
871 {
872 int32_t pos;
873 switch (what) {
874 case ITER_CHARACTER:
875 pos = this->char_itr->next();
876 break;
877
878 case ITER_WORD:
879 pos = this->word_itr->following(this->char_itr->current());
880 /* The ICU word iterator considers both the start and the end of a word a valid
881 * break point, but we only want word starts. Move to the next location in
882 * case the new position points to whitespace. */
883 while (pos != icu::BreakIterator::DONE &&
884 IsWhitespace(Utf16DecodeChar((const uint16_t *)&this->utf16_str[pos]))) {
885 int32_t new_pos = this->word_itr->next();
886 /* Don't set it to DONE if it was valid before. Otherwise we'll return END
887 * even though the iterator wasn't at the end of the string before. */
888 if (new_pos == icu::BreakIterator::DONE) break;
889 pos = new_pos;
890 }
891
892 this->char_itr->isBoundary(pos);
893 break;
894
895 default:
896 NOT_REACHED();
897 }
898
899 return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
900 }
901
902 size_t Prev(IterType what) override
903 {
904 int32_t pos;
905 switch (what) {
906 case ITER_CHARACTER:
907 pos = this->char_itr->previous();
908 break;
909
910 case ITER_WORD:
911 pos = this->word_itr->preceding(this->char_itr->current());
912 /* The ICU word iterator considers both the start and the end of a word a valid
913 * break point, but we only want word starts. Move to the previous location in
914 * case the new position points to whitespace. */
915 while (pos != icu::BreakIterator::DONE &&
916 IsWhitespace(Utf16DecodeChar((const uint16_t *)&this->utf16_str[pos]))) {
917 int32_t new_pos = this->word_itr->previous();
918 /* Don't set it to DONE if it was valid before. Otherwise we'll return END
919 * even though the iterator wasn't at the start of the string before. */
920 if (new_pos == icu::BreakIterator::DONE) break;
921 pos = new_pos;
922 }
923
924 this->char_itr->isBoundary(pos);
925 break;
926
927 default:
928 NOT_REACHED();
929 }
930
931 return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
932 }
933};
934
935/* static */ std::unique_ptr<StringIterator> StringIterator::Create()
936{
937 return std::make_unique<IcuStringIterator>();
938}
939
940#else
941
943class DefaultStringIterator : public StringIterator
944{
945 const char *string;
946 size_t len;
947 size_t cur_pos;
948
949public:
950 DefaultStringIterator() : string(nullptr), len(0), cur_pos(0)
951 {
952 }
953
954 void SetString(const char *s) override
955 {
956 this->string = s;
957 this->len = strlen(s);
958 this->cur_pos = 0;
959 }
960
961 size_t SetCurPosition(size_t pos) override
962 {
963 assert(this->string != nullptr && pos <= this->len);
964 /* Sanitize in case we get a position inside an UTF-8 sequence. */
965 while (pos > 0 && IsUtf8Part(this->string[pos])) pos--;
966 return this->cur_pos = pos;
967 }
968
969 size_t Next(IterType what) override
970 {
971 assert(this->string != nullptr);
972
973 /* Already at the end? */
974 if (this->cur_pos >= this->len) return END;
975
976 switch (what) {
977 case ITER_CHARACTER: {
978 char32_t c;
979 this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
980 return this->cur_pos;
981 }
982
983 case ITER_WORD: {
984 char32_t c;
985 /* Consume current word. */
986 size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
987 while (this->cur_pos < this->len && !IsWhitespace(c)) {
988 this->cur_pos += offs;
989 offs = Utf8Decode(&c, this->string + this->cur_pos);
990 }
991 /* Consume whitespace to the next word. */
992 while (this->cur_pos < this->len && IsWhitespace(c)) {
993 this->cur_pos += offs;
994 offs = Utf8Decode(&c, this->string + this->cur_pos);
995 }
996
997 return this->cur_pos;
998 }
999
1000 default:
1001 NOT_REACHED();
1002 }
1003
1004 return END;
1005 }
1006
1007 size_t Prev(IterType what) override
1008 {
1009 assert(this->string != nullptr);
1010
1011 /* Already at the beginning? */
1012 if (this->cur_pos == 0) return END;
1013
1014 switch (what) {
1015 case ITER_CHARACTER:
1016 return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
1017
1018 case ITER_WORD: {
1019 const char *s = this->string + this->cur_pos;
1020 char32_t c;
1021 /* Consume preceding whitespace. */
1022 do {
1023 s = Utf8PrevChar(s);
1024 Utf8Decode(&c, s);
1025 } while (s > this->string && IsWhitespace(c));
1026 /* Consume preceding word. */
1027 while (s > this->string && !IsWhitespace(c)) {
1028 s = Utf8PrevChar(s);
1029 Utf8Decode(&c, s);
1030 }
1031 /* Move caret back to the beginning of the word. */
1032 if (IsWhitespace(c)) Utf8Consume(&s);
1033
1034 return this->cur_pos = s - this->string;
1035 }
1036
1037 default:
1038 NOT_REACHED();
1039 }
1040
1041 return END;
1042 }
1043};
1044
1045#if defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN)
1046/* static */ std::unique_ptr<StringIterator> StringIterator::Create()
1047{
1048 std::unique_ptr<StringIterator> i = OSXStringIterator::Create();
1049 if (i != nullptr) return i;
1050
1051 return std::make_unique<DefaultStringIterator>();
1052}
1053#else
1054/* static */ std::unique_ptr<StringIterator> StringIterator::Create()
1055{
1056 return std::make_unique<DefaultStringIterator>();
1057}
1058#endif /* defined(WITH_COCOA) && !defined(STRGEN) && !defined(SETTINGSGEN) */
1059
1060#endif
debug_inline constexpr bool HasBit(const T x, const uint8_t y)
Checks if a bit in a value is set.
debug_inline static constexpr uint GB(const T x, const uint8_t s, const uint8_t n)
Fetch n bits from x, started at bit s.
String iterator using ICU as a backend.
Definition string.cpp:791
void SetString(const char *s) override
Set a new iteration string.
Definition string.cpp:815
size_t Prev(IterType what) override
Move the cursor back by one iteration unit.
Definition string.cpp:902
size_t Next(IterType what) override
Advance the cursor by one iteration unit.
Definition string.cpp:870
std::vector< size_t > utf16_to_utf8
Mapping from UTF-16 code point position to index in the UTF-8 source string.
Definition string.cpp:796
size_t SetCurPosition(size_t pos) override
Change the current string cursor.
Definition string.cpp:852
std::vector< UChar > utf16_str
UTF-16 copy of the string.
Definition string.cpp:795
icu::BreakIterator * char_itr
ICU iterator for characters.
Definition string.cpp:792
icu::BreakIterator * word_itr
ICU iterator for words.
Definition string.cpp:793
Class for iterating over different kind of parts of a string.
Definition string_base.h:14
static const size_t END
Sentinel to indicate end-of-iteration.
Definition string_base.h:23
virtual size_t Prev(IterType what=ITER_CHARACTER)=0
Move the cursor back by one iteration unit.
virtual size_t SetCurPosition(size_t pos)=0
Change the current string cursor.
virtual size_t Next(IterType what=ITER_CHARACTER)=0
Advance the cursor by one iteration unit.
static std::unique_ptr< StringIterator > Create()
Create a new iterator instance.
Definition string.cpp:935
IterType
Type of the iterator.
Definition string_base.h:17
@ ITER_WORD
Iterate over words.
Definition string_base.h:19
@ ITER_CHARACTER
Iterate over characters (or more exactly grapheme clusters).
Definition string_base.h:18
virtual void SetString(const char *s)=0
Set a new iteration string.
Control codes that are embedded in the translation strings.
@ SCC_ENCODED
Encoded string marker and sub-string parameter.
@ SCC_ENCODED_NUMERIC
Encoded numeric parameter.
@ SCC_ENCODED_STRING
Encoded string parameter.
@ SCC_ENCODED_INTERNAL
Encoded text from OpenTTD.
Functions related to debugging.
#define Debug(category, level, format_string,...)
Output a line of debugging information.
Definition debug.h:37
Error reporting related functions.
fluid_settings_t * settings
FluidSynth settings handle.
Functions related to the gfx engine.
Information about languages and their files.
const LanguageMetadata * _current_language
The currently loaded language.
Definition strings.cpp:54
std::unique_ptr< icu::Collator > _current_collator
Collator for the language currently in use.
Definition strings.cpp:59
Integer math functions.
A number of safeguards to prevent using unsafe methods.
Definition of base types and functions in a cross-platform compatible way.
static void StrMakeValid(T &dst, const char *str, const char *last, StringValidationSettings settings)
Copies the valid (UTF-8) characters from str up to last to the dst.
Definition string.cpp:125
bool ConvertHexToBytes(std::string_view hex, std::span< uint8_t > bytes)
Convert a hex-string to a byte-array, while validating it was actually hex.
Definition string.cpp:752
bool IsValidChar(char32_t key, CharSetFilter afilter)
Only allow certain keys.
Definition string.cpp:414
bool StrEqualsIgnoreCase(const std::string_view str1, const std::string_view str2)
Compares two string( view)s for equality, while ignoring the case of the characters.
Definition string.cpp:365
bool StrNaturalContains(const std::string_view str, const std::string_view value)
Checks if a string is contained in another string with a locale-aware comparison that is case sensiti...
Definition string.cpp:678
void StrMakeValidInPlace(char *str, const char *last, StringValidationSettings settings)
Scans the string for invalid characters and replaces them with a question mark '?' (if not ignored).
Definition string.cpp:196
void strecpy(std::span< char > dst, std::string_view src)
Copies characters from one buffer to another.
Definition string.cpp:59
std::string FormatArrayAsHex(std::span< const uint8_t > data)
Format a byte array into a continuous hex string.
Definition string.cpp:80
size_t Utf8StringLength(const char *s)
Get the length of an UTF-8 encoded string in number of characters and thus not the number of bytes th...
Definition string.cpp:377
bool StrStartsWithIgnoreCase(std::string_view str, const std::string_view prefix)
Check whether the given string starts with the given prefix, ignoring case.
Definition string.cpp:299
bool StrValid(std::span< const char > str)
Checks whether the given string is valid, i.e.
Definition string.cpp:245
static int ConvertHexNibbleToByte(char c)
Convert a single hex-nibble to a byte.
Definition string.cpp:733
static int ICUStringContains(const std::string_view str, const std::string_view value, bool case_insensitive)
Search if a string is contained in another string using the current locale.
Definition string.cpp:648
static std::string_view SkipGarbage(std::string_view str)
Skip some of the 'garbage' in the string that we don't want to use to sort on.
Definition string.cpp:586
static bool IsSccEncodedCode(char32_t c)
Test if a character is (only) part of an encoded string.
Definition string.cpp:97
size_t Utf8Decode(char32_t *c, const char *s)
Decode and consume the next UTF-8 encoded character.
Definition string.cpp:437
int StrNaturalCompare(std::string_view s1, std::string_view s2, bool ignore_garbage_at_front)
Compares two strings using case insensitive natural sort.
Definition string.cpp:607
bool StrNaturalContainsIgnoreCase(const std::string_view str, const std::string_view value)
Checks if a string is contained in another string with a locale-aware comparison that is case insensi...
Definition string.cpp:705
std::basic_string_view< char, CaseInsensitiveCharTraits > CaseInsensitiveStringView
Case insensitive string view.
Definition string.cpp:331
int StrCompareIgnoreCase(const std::string_view str1, const std::string_view str2)
Compares two string( view)s, while ignoring the case of the characters.
Definition string.cpp:352
bool StrEndsWithIgnoreCase(std::string_view str, const std::string_view suffix)
Check whether the given string ends with the given suffix, ignoring case.
Definition string.cpp:339
size_t Utf8TrimString(char *s, size_t maxlen)
Properly terminate an UTF8 string to some maximum length.
Definition string.cpp:526
void StrTrimInPlace(std::string &str)
Trim the spaces from given string in place, i.e.
Definition string.cpp:278
size_t Utf8Encode(T buf, char32_t c)
Encode a unicode character and place it in the buffer.
Definition string.cpp:478
static bool IsGarbageCharacter(char32_t c)
Test if a unicode character is considered garbage to be skipped.
Definition string.cpp:567
Functions related to low-level strings.
char32_t Utf16DecodeChar(const uint16_t *c)
Decode an UTF-16 character.
bool IsWhitespace(char32_t c)
Check whether UNICODE character is whitespace or not, i.e.
int8_t Utf8EncodedCharLen(char c)
Return the length of an UTF-8 encoded value based on a single char.
char * Utf8PrevChar(char *s)
Retrieve the previous UNICODE character in an UTF-8 encoded string.
int MacOSStringCompare(std::string_view s1, std::string_view s2)
Compares two strings using case insensitive natural sort.
int MacOSStringContains(const std::string_view str, const std::string_view value, bool case_insensitive)
Search if a string is contained in another string using the current locale.
Functions related to localized text support on OSX.
StringValidationSettings
Settings for the string validation.
Definition string_type.h:44
@ SVS_ALLOW_CONTROL_CODE
Allow the special control codes.
Definition string_type.h:48
@ SVS_REPLACE_TAB_CR_NL_WITH_SPACE
Replace tabs ('\t'), carriage returns ('\r') and newlines ('\n') with spaces.
Definition string_type.h:54
@ SVS_ALLOW_NEWLINE
Allow newlines; replaces '\r\n' with '\n' during processing.
Definition string_type.h:47
@ SVS_REPLACE_WITH_QUESTION_MARK
Replace the unknown/bad bits with question marks.
Definition string_type.h:46
CharSetFilter
Valid filter types for IsValidChar.
Definition string_type.h:24
@ CS_NUMERAL_SPACE
Only numbers and spaces.
Definition string_type.h:27
@ CS_HEXADECIMAL
Only hexadecimal characters.
Definition string_type.h:30
@ CS_NUMERAL
Only numeric ones.
Definition string_type.h:26
@ CS_NUMERAL_SIGNED
Only numbers and '-' for negative values.
Definition string_type.h:28
@ CS_ALPHA
Only alphabetic values.
Definition string_type.h:29
@ CS_ALPHANUMERAL
Both numeric and alphabetic and spaces and stuff.
Definition string_type.h:25
Functions related to laying out text on Win32.
Case insensitive implementation of the standard character type traits.
Definition string.cpp:306
char isocode[16]
the ISO code for the language (not country code)
Definition language.h:31
int Win32StringContains(const std::string_view str, const std::string_view value, bool case_insensitive)
Search if a string is contained in another string using the current locale.
Definition win32.cpp:479
declarations of functions for MS windows systems