11#include "../core/alloc_func.hpp"
12#include "../core/endian_func.hpp"
13#include "../core/mem_func.hpp"
14#include "../error_func.h"
15#include "../string_func.h"
16#include "../table/control_codes.h"
21#include "../table/strgen_tables.h"
23#include "../safeguards.h"
29const char *
_file =
"(unknown file)";
31int _errors, _warnings, _show_todo;
34static const CmdStruct *ParseCommandString(
const char **str, std::string &param,
int *argno,
int *casei);
42 caseidx(caseidx), string(string)
54 name(name), english(english), index(index), line(line)
92 this->
strings[ls->index].swap(ls);
116 for (; *s !=
'\0'; s++) {
117 hash = std::rotl(hash, 3) ^ *s;
118 hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1);
141 s = ls->
name.c_str();
142 hash ^= i * 0x717239;
143 hash = (hash & 1 ? hash >> 1 ^ 0xDEADBEEF : hash >> 1);
147 while ((cs = ParseCommandString(&s, buf, &argno, &casei)) !=
nullptr) {
150 hash ^= (cs - _cmd_structs) * 0x1234567;
151 hash = (hash & 1 ? hash >> 1 ^ 0xF00BAA4 : hash >> 1);
170static const char *_cur_ident;
174static int _cur_argidx;
184 this->push_back(value);
194 this->push_back(value);
195 }
else if (value < 0x800) {
196 this->push_back(0xC0 +
GB(value, 6, 5));
197 this->push_back(0x80 +
GB(value, 0, 6));
198 }
else if (value < 0x10000) {
199 this->push_back(0xE0 +
GB(value, 12, 4));
200 this->push_back(0x80 +
GB(value, 6, 6));
201 this->push_back(0x80 +
GB(value, 0, 6));
202 }
else if (value < 0x110000) {
203 this->push_back(0xF0 +
GB(value, 18, 3));
204 this->push_back(0x80 +
GB(value, 12, 6));
205 this->push_back(0x80 +
GB(value, 6, 6));
206 this->push_back(0x80 +
GB(value, 0, 6));
208 StrgenWarning(
"Invalid unicode value U+0x{:X}", value);
213size_t Utf8Validate(
const char *s)
220 }
else if (
GB(s[0], 5, 3) == 6 && IsUtf8Part(s[1])) {
222 c =
GB(s[0], 0, 5) << 6 |
GB(s[1], 0, 6);
223 if (c >= 0x80)
return 2;
224 }
else if (
GB(s[0], 4, 4) == 14 && IsUtf8Part(s[1]) && IsUtf8Part(s[2])) {
226 c =
GB(s[0], 0, 4) << 12 |
GB(s[1], 0, 6) << 6 |
GB(s[2], 0, 6);
227 if (c >= 0x800)
return 3;
228 }
else if (
GB(s[0], 3, 5) == 30 && IsUtf8Part(s[1]) && IsUtf8Part(s[2]) && IsUtf8Part(s[3])) {
230 c =
GB(s[0], 0, 3) << 18 |
GB(s[1], 0, 6) << 12 |
GB(s[2], 0, 6) << 6 |
GB(s[3], 0, 6);
231 if (c >= 0x10000 && c <= 0x10FFFF)
return 4;
238void EmitSingleChar(
Buffer *buffer,
char *buf,
int value)
240 if (*buf !=
'\0') StrgenWarning(
"Ignoring trailing letters in command");
251bool ParseRelNum(
char **buf,
int *value,
int *offset)
253 const char *s = *buf;
257 while (*s ==
' ' || *s ==
'\t') s++;
262 int v = std::strtol(s, &end, 0);
263 if (end == s)
return false;
269 if (offset !=
nullptr && *end ==
':') {
272 *offset = std::strtol(s, &end, 0);
273 if (end == s)
return false;
280char *ParseWord(
char **buf)
284 while (*s ==
' ' || *s ==
'\t') s++;
285 if (*s ==
'\0')
return nullptr;
291 if (*s ==
'\0')
break;
302 if (*s ==
'\0')
break;
303 if (*s ==
' ' || *s ==
'\t') {
315static int TranslateArgumentIdx(
int arg,
int offset = 0);
317static void EmitWordList(
Buffer *buffer,
const std::vector<const char *> &words, uint nw)
320 constexpr uint MAX_WORD_LENGTH = UINT8_MAX - 2;
323 for (uint i = 0; i < nw; i++) {
324 size_t len = strlen(words[i]) + 1;
325 if (len >= UINT8_MAX) StrgenFatal(
"WordList {}/{} string '{}' too long, max bytes {}", i + 1, nw, words[i], MAX_WORD_LENGTH);
326 buffer->
AppendByte(
static_cast<uint8_t
>(len));
328 for (uint i = 0; i < nw; i++) {
329 for (uint j = 0; words[i][j] !=
'\0'; j++) buffer->
AppendByte(words[i][j]);
334void EmitPlural(
Buffer *buffer,
char *buf,
int)
336 int argidx = _cur_argidx;
339 std::vector<const char *> words(std::max(expected,
MAX_PLURALS),
nullptr);
343 if (!ParseRelNum(&buf, &argidx, &offset)) argidx--;
345 const CmdStruct *cmd = _cur_pcs.consuming_commands[argidx];
348 if (cmd ==
nullptr || cmd->default_plural_offset < 0) {
349 StrgenFatal(
"Command '{}' has no (default) plural position", cmd ==
nullptr ?
"<empty>" : cmd->cmd);
351 offset = cmd->default_plural_offset;
356 words[nw] = ParseWord(&buf);
357 if (words[nw] ==
nullptr)
break;
361 StrgenFatal(
"{}: No plural words", _cur_ident);
364 if (expected != nw) {
366 StrgenFatal(
"{}: Invalid number of plural forms. Expecting {}, found {}.", _cur_ident,
369 if ((_show_todo & 2) != 0) StrgenWarning(
"'{}' is untranslated. Tweaking english string to allow compilation for plural forms", _cur_ident);
373 for (; nw < expected; nw++) {
374 words[nw] = words[nw - 1];
382 buffer->
AppendByte(TranslateArgumentIdx(argidx, offset));
383 EmitWordList(buffer, words, nw);
386void EmitGender(
Buffer *buffer,
char *buf,
int)
388 int argidx = _cur_argidx;
397 if (nw >=
MAX_NUM_GENDERS) StrgenFatal(
"G argument '{}' invalid", buf);
407 ParseRelNum(&buf, &argidx, &offset);
409 const CmdStruct *cmd = _cur_pcs.consuming_commands[argidx];
410 if (cmd ==
nullptr || (cmd->flags &
C_GENDER) == 0) {
411 StrgenFatal(
"Command '{}' can't have a gender", cmd ==
nullptr ?
"<empty>" : cmd->cmd);
415 words[nw] = ParseWord(&buf);
416 if (words[nw] ==
nullptr)
break;
418 if (nw !=
_lang.
num_genders) StrgenFatal(
"Bad # of arguments for gender command");
420 assert(
IsInsideBS(cmd->value, SCC_CONTROL_START, UINT8_MAX));
422 buffer->
AppendByte(TranslateArgumentIdx(argidx, offset));
423 EmitWordList(buffer, words, nw);
427static const CmdStruct *FindCmd(
const char *s,
int len)
429 for (
const auto &cs : _cmd_structs) {
430 if (strncmp(cs.cmd, s, len) == 0 && cs.cmd[len] ==
'\0')
return &cs;
435static uint ResolveCaseName(
const char *str,
size_t len)
439 len = std::min(
lengthof(case_str) - 1, len);
440 memcpy(case_str, str, len);
441 case_str[len] =
'\0';
444 if (case_idx >=
MAX_NUM_CASES) StrgenFatal(
"Invalid case-name '{}'", case_str);
451static const CmdStruct *ParseCommandString(
const char **str, std::string &param,
int *argno,
int *casei)
453 const char *s = *str, *start;
460 for (; *s !=
'{'; s++) {
461 if (*s ==
'\0')
return nullptr;
465 if (*s >=
'0' && *s <=
'9') {
468 *argno = std::strtoul(s, &end, 0);
469 if (*end !=
':') StrgenFatal(
"missing arg #");
477 }
while (c !=
'}' && c !=
' ' && c !=
'=' && c !=
'.' && c != 0);
479 const CmdStruct *cmd = FindCmd(start, s - start - 1);
480 if (cmd ==
nullptr) {
481 std::string command(start, s - start - 1);
482 StrgenError(
"Undefined command '{}'", command);
487 const char *casep = s;
489 if (!(cmd->flags &
C_CASE)) {
490 StrgenFatal(
"Command '{}' can't have a case", cmd->cmd);
495 }
while (c !=
'}' && c !=
' ' && c !=
'\0');
496 *casei = ResolveCaseName(casep, s - casep - 1);
500 StrgenError(
"Missing }} from command '{}'", start);
513 StrgenError(
"Missing }} from command '{}'", start);
533 data(data), file(file), master(master), translation(translation)
548 const CmdStruct *ar = ParseCommandString(&s, param, &argno, &casei);
550 if (ar ==
nullptr)
break;
553 if (argno != -1 && ar->consumes == 0) StrgenFatal(
"Non consumer param can't have a paramindex");
556 if (argno != -1) argidx = argno;
557 if (argidx < 0 || (uint)argidx >= p.consuming_commands.max_size()) StrgenFatal(
"invalid param idx {}", argidx);
558 if (p.consuming_commands[argidx] !=
nullptr && p.consuming_commands[argidx] != ar) StrgenFatal(
"duplicate param idx {}", argidx);
560 p.consuming_commands[argidx++] = ar;
562 p.non_consuming_commands.emplace_back(
CmdPair{ar, std::move(param)});
572 if (a ==
nullptr)
return nullptr;
574 if (strcmp(a->cmd,
"STRING1") == 0 ||
575 strcmp(a->cmd,
"STRING2") == 0 ||
576 strcmp(a->cmd,
"STRING3") == 0 ||
577 strcmp(a->cmd,
"STRING4") == 0 ||
578 strcmp(a->cmd,
"STRING5") == 0 ||
579 strcmp(a->cmd,
"STRING6") == 0 ||
580 strcmp(a->cmd,
"STRING7") == 0 ||
581 strcmp(a->cmd,
"RAW_STRING") == 0) {
582 return FindCmd(
"STRING", 6);
589static bool CheckCommandsMatch(
const char *a,
const char *b,
const char *name)
603 if (templ.non_consuming_commands.max_size() != lang.non_consuming_commands.max_size()) {
604 StrgenWarning(
"{}: template string and language string have a different # of commands", name);
608 for (
auto &templ_nc : templ.non_consuming_commands) {
611 for (
auto &lang_nc : lang.non_consuming_commands) {
612 if (templ_nc.cmd == lang_nc.cmd && templ_nc.param == lang_nc.param) {
614 lang_nc.cmd =
nullptr;
621 StrgenWarning(
"{}: command '{}' exists in template file but not in language file", name, templ_nc.cmd->cmd);
628 for (uint i = 0; i < templ.consuming_commands.max_size(); i++) {
629 if (TranslateCmdForCompare(templ.consuming_commands[i]) != lang.consuming_commands[i]) {
630 StrgenWarning(
"{}: Param idx #{} '{}' doesn't match with template command '{}'", name, i,
631 lang.consuming_commands[i] ==
nullptr ?
"<empty>" : TranslateCmdForCompare(lang.consuming_commands[i])->cmd,
632 templ.consuming_commands[i] == nullptr ?
"<empty>" : templ.consuming_commands[i]->cmd);
640void StringReader::HandleString(
char *str)
643 if (str[1] ==
'#' && str[2] !=
'#') this->
HandlePragma(str + 2);
648 if (*str ==
';' || *str ==
' ' || *str ==
'\0')
return;
650 char *s = strchr(str,
':');
652 StrgenError(
"Line has no ':' delimiter");
659 for (t = s; t > str && (t[-1] ==
' ' || t[-1] ==
'\t'); t--) {}
665 for (tmp = s; *tmp !=
'\0';) {
666 size_t len = Utf8Validate(tmp);
667 if (len == 0) StrgenFatal(
"Invalid UTF-8 sequence in '{}'", s);
673 (c >= 0xE000 && c <= 0xF8FF) ||
674 (c >= 0xFFF0 && c <= 0xFFFF)) {
675 StrgenFatal(
"Unwanted UTF-8 character U+{:04X} in sequence '{}'", (
int)c, s);
683 char *casep = strchr(str,
'.');
684 if (casep !=
nullptr) *casep++ =
'\0';
690 if (casep !=
nullptr) {
691 StrgenError(
"Cases in the base translation are not supported.");
695 if (ent !=
nullptr) {
696 StrgenError(
"String name '{}' is used multiple times", str);
700 if (this->
data.
strings[this->data.next_string_id] !=
nullptr) {
701 StrgenError(
"String ID 0x{:X} for '{}' already in use by '{}'", this->
data.
next_string_id, str, this->data.strings[this->data.next_string_id]->name);
708 if (ent ==
nullptr) {
709 StrgenWarning(
"String name '{}' does not exist in master file", str);
713 if (!ent->
translated.empty() && casep ==
nullptr) {
714 StrgenError(
"String name '{}' is used multiple times", str);
719 if (!CheckCommandsMatch(s, ent->
english.c_str(), str))
return;
721 if (casep !=
nullptr) {
722 ent->
translated_cases.emplace_back(ResolveCaseName(casep, strlen(casep)), s);
735 if (!memcmp(str,
"plural ", 7)) {
741 StrgenFatal(
"unknown pragma '{}'", str);
745static void StripTrailingWhitespace(std::string &str)
747 str.erase(str.find_last_not_of(
"\r\n ") + 1);
752 _warnings = _errors = 0;
769 std::optional<std::string> line = this->
ReadLine();
770 if (!line.has_value())
return;
772 StripTrailingWhitespace(line.value());
773 this->HandleString(line.value().data());
778 StrgenError(
"Too many strings, maximum allowed is {}", this->
data.
max_strings);
790 if (data.
strings[i] !=
nullptr) {
799static int TranslateArgumentIdx(
int argidx,
int offset)
803 if (argidx < 0 || (uint)argidx >= _cur_pcs.consuming_commands.max_size()) {
804 StrgenFatal(
"invalid argidx {}", argidx);
806 const CmdStruct *cs = _cur_pcs.consuming_commands[argidx];
807 if (cs !=
nullptr && cs->consumes <= offset) {
808 StrgenFatal(
"invalid argidx offset {}:{}", argidx, offset);
811 if (_cur_pcs.consuming_commands[argidx] ==
nullptr) {
812 StrgenFatal(
"no command for this argidx {}", argidx);
815 for (
int i = sum = 0; i < argidx; i++) {
816 cs = _cur_pcs.consuming_commands[i];
818 sum += (cs !=
nullptr) ? cs->consumes : 1;
824static void PutArgidxCommand(
Buffer *buffer)
827 buffer->
AppendByte(TranslateArgumentIdx(_cur_argidx));
831static void PutCommandString(
Buffer *buffer,
const char *str)
835 while (*str !=
'\0') {
845 const CmdStruct *cs = ParseCommandString(&str, param, &argno, &casei);
846 if (cs ==
nullptr)
break;
854 if (cs->consumes > 0) {
856 if (argno != -1 && argno != _cur_argidx) {
858 PutArgidxCommand(buffer);
862 cs = _cur_pcs.consuming_commands[_cur_argidx++];
864 StrgenFatal(
"{}: No argument exists at position {}", _cur_ident, _cur_argidx - 1);
868 cs->proc(buffer, param.data(), cs->value);
880 if (length >= 0x4000) {
881 StrgenFatal(
"string too long");
884 if (length >= 0xC0) {
885 buffer[offs++] = (length >> 8) | 0xC0;
887 buffer[offs++] = length & 0xFF;
888 this->
Write((uint8_t*)buffer, offs);
897 std::vector<uint> in_use;
898 for (
size_t tab = 0; tab < data.
tabs; tab++) {
904 for (uint j = 0; j != in_use[tab]; j++) {
918 for (
size_t tab = 0; tab < data.
tabs; tab++) {
919 for (uint j = 0; j != in_use[tab]; j++) {
921 const std::string *cmdp;
929 _cur_ident = ls->
name.c_str();
933 if (_show_todo > 0 && ls->
translated.empty()) {
934 if ((_show_todo & 2) != 0) {
935 StrgenWarning(
"'{}' is untranslated", ls->
name);
937 if ((_show_todo & 1) != 0) {
938 const char *s =
"<TODO> ";
944 _cur_pcs = ExtractCommandString(ls->
english.c_str(),
false);
966 uint pos = (uint)buffer.size();
970 PutCommandString(&buffer, c.string.c_str());
973 uint size = (uint)buffer.size() - (pos + 2);
974 buffer[pos + 0] =
GB(size, 8, 8);
975 buffer[pos + 1] =
GB(size, 0, 8);
979 if (!cmdp->empty()) PutCommandString(&buffer, cmdp->c_str());
982 this->
Write(buffer.data(), buffer.size());
debug_inline constexpr bool HasBit(const T x, const uint8_t y)
Checks if a bit in a value is set.
debug_inline static constexpr uint GB(const T x, const uint8_t s, const uint8_t n)
Fetch n bits from x, started at bit s.
static const uint8_t MAX_NUM_GENDERS
Maximum number of supported genders.
static const uint8_t CASE_GENDER_LEN
The (maximum) length of a case/gender string.
static const uint8_t MAX_NUM_CASES
Maximum number of supported cases.
constexpr bool IsInsideBS(const T x, const size_t base, const size_t size)
Checks if a value is between a window started at some base point.
void MemSetT(T *ptr, uint8_t value, size_t num=1)
Type-safe version of memset().
#define lengthof(array)
Return the length of a fixed size array.
Structures related to strgen.
LanguagePackHeader _lang
Header information about a language.
int _cur_line
The current line we're parsing in the input file.
const char * _file
The filename of the input, so we can refer to it in errors/warnings.
LanguagePackHeader _lang
Header information about a language.
static bool _translated
Whether the current language is not the master language.
int _cur_line
The current line we're parsing in the input file.
static bool _translation
Is the current file actually a translation or not.
const char * _file
The filename of the input, so we can refer to it in errors/warnings.
static const int MAX_PLURALS
The maximum number of plurals.
static const PluralForm _plural_forms[]
All plural forms used.
@ C_GENDER
These commands support genders.
@ C_CASE
These commands support cases.
@ C_DONTCOUNT
These commands aren't counted for comparison.
void strecpy(std::span< char > dst, std::string_view src)
Copies characters from one buffer to another.
size_t Utf8Decode(char32_t *c, const char *s)
Decode and consume the next UTF-8 encoded character.
static const uint TAB_SIZE
Number of strings per StringTab.
The buffer for writing a single string.
void AppendUtf8(uint32_t value)
Add a Unicode character encoded in UTF-8 to the buffer.
void AppendByte(uint8_t value)
Convenience method for adding a byte.
Container for the different cases of a string.
Case(int caseidx, const std::string &string)
Create a new case.
Information about a single string.
int line
Line of string in source-file.
LangString(const std::string &name, const std::string &english, size_t index, int line)
Create a new string.
std::string english
English text.
std::vector< Case > translated_cases
Cases of the translation.
std::string translated
Translated text.
void FreeTranslation()
Free all data related to the translation.
std::string name
Name of the string.
virtual void WriteHeader(const LanguagePackHeader *header)=0
Write the header metadata.
virtual void WriteLength(uint length)
Write the length as a simple gamma.
virtual void Write(const uint8_t *buffer, size_t length)=0
Write a number of bytes.
virtual void WriteLang(const StringData &data)
Actually write the language.
Information about the currently known strings.
size_t tabs
The number of 'tabs' of strings.
uint CountInUse(uint tab) const
Count the number of tab elements that are in use.
uint VersionHashStr(uint hash, const char *s) const
Create a compound hash.
std::vector< std::unique_ptr< LangString > > strings
List of all known strings.
size_t max_strings
The maximum number of strings.
void Add(std::unique_ptr< LangString > ls)
Add a newly created LangString.
size_t next_string_id
The next string ID to allocate.
uint Version() const
Make a hash of the file to get a unique "version number".
LangString * Find(const std::string_view s)
Find a LangString based on the string name.
void FreeTranslation()
Free all data related to the translation.
StringData(size_t tabs)
Create a new string data container.
std::unordered_map< std::string_view, LangString * > name_to_string
Lookup table for the strings.
const std::string file
The file we are reading.
StringReader(StringData &data, const std::string &file, bool master, bool translation)
Prepare reading.
StringData & data
The data to fill during reading.
virtual void ParseFile()
Start parsing the file.
bool translation
Are we reading a translation, implies !master. However, the base translation will have this false.
virtual std::optional< std::string > ReadLine()=0
Read a single line from the source of strings.
virtual void HandlePragma(char *str)
Handle the pragma of the file.
bool master
Are we reading the master file?