OpenTTD Source 20260108-master-g8ba1860eaa
utf8.hpp
Go to the documentation of this file.
1/*
2 * This file is part of OpenTTD.
3 * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
4 * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
5 * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <https://www.gnu.org/licenses/old-licenses/gpl-2.0>.
6 */
7
10#ifndef UTF8_HPP
11#define UTF8_HPP
12
13#include <iterator>
14#include "bitmath_func.hpp"
15
/**
 * Encode a character to UTF-8.
 * @param c Codepoint to encode.
 * @return Pair of a 4-byte buffer and a size; presumably the encoded bytes and how many of them are used (confirm against the definition in utf8.cpp).
 */
[[nodiscard]] auto EncodeUtf8(char32_t c) -> std::pair<char[4], size_t>;

/**
 * Decode a character from UTF-8.
 * @param buf Bytes to decode from.
 * @return Pair of a length and the decoded codepoint. Utf8View::iterator treats a length of 0 as "nothing could be decoded".
 */
[[nodiscard]] auto DecodeUtf8(std::string_view buf) -> std::pair<size_t, char32_t>;
18
19/* Check if the given character is part of a UTF8 sequence */
20inline bool IsUtf8Part(char c)
21{
22 return GB(c, 6, 2) == 2;
23}
24
28class Utf8View {
29 std::string_view src;
30public:
31 Utf8View() = default;
32 Utf8View(std::string_view src) : src(src) {}
33
41 class iterator {
42 std::string_view src;
43 size_t position = 0;
44 public:
45 using value_type = char32_t;
46 using difference_type = std::ptrdiff_t;
47 using iterator_category = std::bidirectional_iterator_tag;
48 using pointer = void;
49 using reference = void;
50
51 iterator() = default;
52 iterator(std::string_view src, size_t position) : src(src), position(position) {}
53
54 size_t GetByteOffset() const
55 {
56 return this->position;
57 }
58
59 bool operator==(const iterator &rhs) const
60 {
61 assert(this->src.data() == rhs.src.data());
62 return this->position == rhs.position;
63 }
64
65 std::strong_ordering operator<=>(const iterator &rhs) const
66 {
67 assert(this->src.data() == rhs.src.data());
68 return this->position <=> rhs.position;
69 }
70
71 char32_t operator*() const
72 {
73 assert(this->position < this->src.size());
74 auto [len, c] = DecodeUtf8(this->src.substr(this->position));
75 return len > 0 ? c : '?';
76 }
77
78 iterator& operator++()
79 {
80 auto size = this->src.size();
81 assert(this->position < size);
82 do {
83 ++this->position;
84 } while (this->position < size && IsUtf8Part(this->src[this->position]));
85 return *this;
86 }
87
88 iterator operator++(int)
89 {
90 iterator result = *this;
91 ++*this;
92 return result;
93 }
94
95 iterator& operator--()
96 {
97 assert(this->position > 0);
98 do {
99 --this->position;
100 } while (this->position > 0 && IsUtf8Part(this->src[this->position]));
101 return *this;
102 }
103
104 iterator operator--(int)
105 {
106 iterator result = *this;
107 --*this;
108 return result;
109 }
110 };
111
112 iterator begin() const
113 {
114 return iterator(this->src, 0);
115 }
116
117 iterator end() const
118 {
119 return iterator(this->src, this->src.size());
120 }
121
122 iterator GetIterAtByte(size_t offset) const;
123};
124
125#endif /* UTF8_HPP */
Functions related to bit mathematics.
static constexpr uint GB(const T x, const uint8_t s, const uint8_t n)
Fetch n bits from x, started at bit s.
Bidirectional input iterator over codepoints.
Definition utf8.hpp:41
Constant span of UTF-8 encoded data.
Definition utf8.hpp:28
iterator GetIterAtByte(size_t offset) const
Create iterator pointing at codepoint, which occupies the byte position "offset".
Definition utf8.cpp:81
std::pair< size_t, char32_t > DecodeUtf8(std::string_view buf)
Decode a character from UTF-8.
Definition utf8.cpp:46
std::pair< char[4], size_t > EncodeUtf8(char32_t c)
Encode a character to UTF-8.
Definition utf8.cpp:19