14#include "../safeguards.h"
21[[nodiscard]] std::pair<char[4], size_t>
EncodeUtf8(
char32_t c)
23 std::pair<char[4], size_t> result{};
24 auto &[buf, len] = result;
27 }
else if (c < 0x800) {
28 buf[len++] = 0xC0 +
GB(c, 6, 5);
29 buf[len++] = 0x80 +
GB(c, 0, 6);
30 }
else if (c < 0x10000) {
31 buf[len++] = 0xE0 +
GB(c, 12, 4);
32 buf[len++] = 0x80 +
GB(c, 6, 6);
33 buf[len++] = 0x80 +
GB(c, 0, 6);
34 }
else if (c < 0x110000) {
35 buf[len++] = 0xF0 +
GB(c, 18, 3);
36 buf[len++] = 0x80 +
GB(c, 12, 6);
37 buf[len++] = 0x80 +
GB(c, 6, 6);
38 buf[len++] = 0x80 +
GB(c, 0, 6);
21[[nodiscard]] std::pair<char[4], size_t>
EncodeUtf8(
char32_t c) {
…}
48[[nodiscard]] std::pair<size_t, char32_t>
DecodeUtf8(std::string_view buf)
50 if (buf.size() >= 1 && !
HasBit(buf[0], 7)) {
54 }
else if (buf.size() >= 2 &&
GB(buf[0], 5, 3) == 6) {
55 if (IsUtf8Part(buf[1])) {
57 char32_t c =
GB(buf[0], 0, 5) << 6 |
GB(buf[1], 0, 6);
58 if (c >= 0x80)
return {2, c};
60 }
else if (buf.size() >= 3 &&
GB(buf[0], 4, 4) == 14) {
61 if (IsUtf8Part(buf[1]) && IsUtf8Part(buf[2])) {
63 char32_t c =
GB(buf[0], 0, 4) << 12 |
GB(buf[1], 0, 6) << 6 |
GB(buf[2], 0, 6);
64 if (c >= 0x800)
return {3, c};
66 }
else if (buf.size() >= 4 &&
GB(buf[0], 3, 5) == 30) {
67 if (IsUtf8Part(buf[1]) && IsUtf8Part(buf[2]) && IsUtf8Part(buf[3])) {
69 char32_t c =
GB(buf[0], 0, 3) << 18 |
GB(buf[1], 0, 6) << 12 |
GB(buf[2], 0, 6) << 6 |
GB(buf[3], 0, 6);
70 if (c >= 0x10000 && c <= 0x10FFFF)
return {4, c};
48[[nodiscard]] std::pair<size_t, char32_t>
DecodeUtf8(std::string_view buf) {
…}
85 assert(offset <= this->src.size());
86 if (offset >= this->src.size())
return this->end();
89 auto it =
iterator(this->src, offset + 1);
debug_inline constexpr bool HasBit(const T x, const uint8_t y)
Checks if a bit in a value is set.
debug_inline static constexpr uint GB(const T x, const uint8_t s, const uint8_t n)
Fetch n bits from x, starting at bit s.
Bidirectional input iterator over codepoints.
iterator GetIterAtByte(size_t offset) const
Create an iterator pointing at the codepoint that occupies the byte position "offset".
std::pair< size_t, char32_t > DecodeUtf8(std::string_view buf)
Decode a character from UTF-8.
std::pair< char[4], size_t > EncodeUtf8(char32_t c)
Encode a character to UTF-8.
Handling of UTF-8 encoded data.