OpenTTD Source 20260108-master-g8ba1860eaa
utf8.cpp
Go to the documentation of this file.
1/*
2 * This file is part of OpenTTD.
3 * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
4 * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
5 * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <https://www.gnu.org/licenses/old-licenses/gpl-2.0>.
6 */
7
10#include "../stdafx.h"
11#include "utf8.hpp"
12#include "../safeguards.h"
13
19[[nodiscard]] std::pair<char[4], size_t> EncodeUtf8(char32_t c)
20{
21 std::pair<char[4], size_t> result{};
22 auto &[buf, len] = result;
23 if (c < 0x80) {
24 buf[len++] = c;
25 } else if (c < 0x800) {
26 buf[len++] = 0xC0 + GB(c, 6, 5);
27 buf[len++] = 0x80 + GB(c, 0, 6);
28 } else if (c < 0x10000) {
29 buf[len++] = 0xE0 + GB(c, 12, 4);
30 buf[len++] = 0x80 + GB(c, 6, 6);
31 buf[len++] = 0x80 + GB(c, 0, 6);
32 } else if (c < 0x110000) {
33 buf[len++] = 0xF0 + GB(c, 18, 3);
34 buf[len++] = 0x80 + GB(c, 12, 6);
35 buf[len++] = 0x80 + GB(c, 6, 6);
36 buf[len++] = 0x80 + GB(c, 0, 6);
37 }
38 return result;
39}
40
46[[nodiscard]] std::pair<size_t, char32_t> DecodeUtf8(std::string_view buf)
47{
48 if (buf.size() >= 1 && !HasBit(buf[0], 7)) {
49 /* Single byte character: 0xxxxxxx */
50 char32_t c = buf[0];
51 return {1, c};
52 } else if (buf.size() >= 2 && GB(buf[0], 5, 3) == 6) {
53 if (IsUtf8Part(buf[1])) {
54 /* Double byte character: 110xxxxx 10xxxxxx */
55 char32_t c = GB(buf[0], 0, 5) << 6 | GB(buf[1], 0, 6);
56 if (c >= 0x80) return {2, c};
57 }
58 } else if (buf.size() >= 3 && GB(buf[0], 4, 4) == 14) {
59 if (IsUtf8Part(buf[1]) && IsUtf8Part(buf[2])) {
60 /* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
61 char32_t c = GB(buf[0], 0, 4) << 12 | GB(buf[1], 0, 6) << 6 | GB(buf[2], 0, 6);
62 if (c >= 0x800) return {3, c};
63 }
64 } else if (buf.size() >= 4 && GB(buf[0], 3, 5) == 30) {
65 if (IsUtf8Part(buf[1]) && IsUtf8Part(buf[2]) && IsUtf8Part(buf[3])) {
66 /* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
67 char32_t c = GB(buf[0], 0, 3) << 18 | GB(buf[1], 0, 6) << 12 | GB(buf[2], 0, 6) << 6 | GB(buf[3], 0, 6);
68 if (c >= 0x10000 && c <= 0x10FFFF) return {4, c};
69 }
70 }
71 return {};
72}
73
82{
83 assert(offset <= this->src.size());
84 if (offset >= this->src.size()) return this->end();
85
86 /* Sanitize iterator to point to the start of a codepoint */
87 auto it = iterator(this->src, offset + 1);
88 --it;
89 return it;
90}
static constexpr uint GB(const T x, const uint8_t s, const uint8_t n)
Fetch n bits from x, starting at bit s.
constexpr bool HasBit(const T x, const uint8_t y)
Checks if a bit in a value is set.
Bidirectional input iterator over codepoints.
Definition utf8.hpp:41
iterator GetIterAtByte(size_t offset) const
Create iterator pointing at codepoint, which occupies the byte position "offset".
Definition utf8.cpp:81
std::pair< size_t, char32_t > DecodeUtf8(std::string_view buf)
Decode a character from UTF-8.
Definition utf8.cpp:46
std::pair< char[4], size_t > EncodeUtf8(char32_t c)
Encode a character to UTF-8.
Definition utf8.cpp:19
Handling of UTF-8 encoded data.