OpenTTD Source 20250612-master-gb012d9e3dc
utf8.hpp
Go to the documentation of this file.
1/*
2 * This file is part of OpenTTD.
3 * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
4 * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
5 * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
6 */
7
12#ifndef UTF8_HPP
13#define UTF8_HPP
14
15#include <iterator>
16#include "bitmath_func.hpp"
17
/**
 * Decode a character from UTF-8.
 * @param buf Bytes to decode; decoding starts at the first byte of the view.
 * @return Pair of (length, character). Utf8View::iterator treats a length of 0
 *         as "nothing could be decoded" and substitutes '?'.
 * @note Implemented in utf8.cpp; presumably the length is the number of bytes
 *       consumed from \a buf -- confirm against the definition.
 */
[[nodiscard]] std::pair<size_t, char32_t> DecodeUtf8(std::string_view buf);

/**
 * Encode a character to UTF-8.
 * @param c Character to encode.
 * @return Pair of a 4-char buffer and a size_t.
 * @note Implemented in utf8.cpp; presumably the size_t is the number of bytes
 *       of the buffer that are in use -- confirm against the definition.
 */
[[nodiscard]] std::pair<char[4], size_t> EncodeUtf8(char32_t c);
20
21/* Check if the given character is part of a UTF8 sequence */
22inline bool IsUtf8Part(char c)
23{
24 return GB(c, 6, 2) == 2;
25}
26
30class Utf8View {
31 std::string_view src;
32public:
33 Utf8View() = default;
34 Utf8View(std::string_view src) : src(src) {}
35
43 class iterator {
44 std::string_view src;
45 size_t position = 0;
46 public:
47 using value_type = char32_t;
48 using difference_type = std::ptrdiff_t;
49 using iterator_category = std::bidirectional_iterator_tag;
50 using pointer = void;
51 using reference = void;
52
53 iterator() = default;
54 iterator(std::string_view src, size_t position) : src(src), position(position) {}
55
56 size_t GetByteOffset() const
57 {
58 return this->position;
59 }
60
61 bool operator==(const iterator &rhs) const
62 {
63 assert(this->src.data() == rhs.src.data());
64 return this->position == rhs.position;
65 }
66
67 std::strong_ordering operator<=>(const iterator &rhs) const
68 {
69 assert(this->src.data() == rhs.src.data());
70 return this->position <=> rhs.position;
71 }
72
73 char32_t operator*() const
74 {
75 assert(this->position < this->src.size());
76 auto [len, c] = DecodeUtf8(this->src.substr(this->position));
77 return len > 0 ? c : '?';
78 }
79
80 iterator& operator++()
81 {
82 auto size = this->src.size();
83 assert(this->position < size);
84 do {
85 ++this->position;
86 } while (this->position < size && IsUtf8Part(this->src[this->position]));
87 return *this;
88 }
89
90 iterator operator++(int)
91 {
92 iterator result = *this;
93 ++*this;
94 return result;
95 }
96
97 iterator& operator--()
98 {
99 assert(this->position > 0);
100 do {
101 --this->position;
102 } while (this->position > 0 && IsUtf8Part(this->src[this->position]));
103 return *this;
104 }
105
106 iterator operator--(int)
107 {
108 iterator result = *this;
109 --*this;
110 return result;
111 }
112 };
113
114 iterator begin() const
115 {
116 return iterator(this->src, 0);
117 }
118
119 iterator end() const
120 {
121 return iterator(this->src, this->src.size());
122 }
123
124 iterator GetIterAtByte(size_t offset) const;
125};
126
127#endif /* UTF8_HPP */
Functions related to bit mathematics.
debug_inline static constexpr uint GB(const T x, const uint8_t s, const uint8_t n)
Fetch n bits from x, started at bit s.
Bidirectional input iterator over codepoints.
Definition utf8.hpp:43
Constant span of UTF-8 encoded data.
Definition utf8.hpp:30
iterator GetIterAtByte(size_t offset) const
Create iterator pointing at codepoint, which occupies the byte position "offset".
Definition utf8.cpp:83
std::pair< size_t, char32_t > DecodeUtf8(std::string_view buf)
Decode a character from UTF-8.
Definition utf8.cpp:48
std::pair< char[4], size_t > EncodeUtf8(char32_t c)
Encode a character to UTF-8.
Definition utf8.cpp:21