From 48ea1e8ae70873c624e7ae5b1246d40b73bff907 Mon Sep 17 00:00:00 2001 From: Michael Zhao <44533763+Pistonight@users.noreply.github.com> Date: Tue, 16 Jun 2026 03:48:34 -0700 Subject: [PATCH] implement UTF-8 for TodStringFile, ImageFont, DescParser --- src/Sexy.TodLib/TodStringFile.cpp | 43 ++++++++----- src/SexyAppFramework/Common.cpp | 94 ++++++++++++++++++----------- src/SexyAppFramework/DescParser.cpp | 68 +++++++++++++-------- src/SexyAppFramework/DescParser.h | 3 +- src/SexyAppFramework/Encoding.cpp | 88 +++++++++++++++++++++++++++ src/SexyAppFramework/Encoding.h | 16 +++++ src/SexyAppFramework/Font.cpp | 4 +- src/SexyAppFramework/ImageFont.cpp | 4 ++ src/main.cpp | 3 +- 9 files changed, 245 insertions(+), 78 deletions(-) create mode 100644 src/SexyAppFramework/Encoding.cpp create mode 100644 src/SexyAppFramework/Encoding.h diff --git a/src/Sexy.TodLib/TodStringFile.cpp b/src/Sexy.TodLib/TodStringFile.cpp index e0b4bd1e..13d96954 100644 --- a/src/Sexy.TodLib/TodStringFile.cpp +++ b/src/Sexy.TodLib/TodStringFile.cpp @@ -1,8 +1,9 @@ #include "TodDebug.h" -#include "TodCommon.h" -#include "TodStringFile.h" #include "../PakLib/PakInterface.h" +#include "../SexyAppFramework/Encoding.h" #include "../SexyAppFramework/Font.h" +#include "TodCommon.h" +#include "TodStringFile.h" int gTodStringFormatCount; TodStringListFormat *gTodStringFormats; @@ -125,8 +126,10 @@ bool TodStringListReadItems(const char *theFileText) } } + bool TodStringListReadFile(const char *theFileName) { + TodTrace("[TodLib] - Loading String File '%s'", theFileName); PFILE *pFile = p_fopen(theFileName, "rb"); if (pFile == nullptr) { @@ -135,24 +138,34 @@ bool TodStringListReadFile(const char *theFileName) } p_fseek(pFile, 0, SEEK_END); - int aSize = p_ftell(pFile); + auto aSize = p_ftell(pFile); p_fseek(pFile, 0, SEEK_SET); - char *aFileText = new char[aSize + 1]; - bool aSuccess = true; - if (p_fread(aFileText, sizeof(char), aSize, pFile) <= 0) + + if (aSize <= 0) + { + TodTrace("[TodLib] - 
String file is empty: '%s'", theFileName); + p_fclose(pFile); + return false; + } + + std::string aBytes; + aBytes.resize(aSize); + if (aSize > 0 && p_fread(&aBytes[0], sizeof(char), aSize, pFile) <= 0) { TodTrace("[TodLib] - Failed to read '%s'", theFileName); - aSuccess = false; - } - aFileText[aSize] = '\0'; - std::string aFixedContent = ANSIToUTF8(aFileText); - if (aSuccess) - { - aSuccess = TodStringListReadItems(aFixedContent.c_str()); + p_fclose(pFile); + return false; } + p_fclose(pFile); - delete[] aFileText; + std::optional aDecoded = Sexy::ConvertToUtf8IfNeeded(aBytes); + bool aSuccess = aDecoded + ? TodStringListReadItems(aDecoded->c_str()) + : TodStringListReadItems(aBytes.c_str()); + if (!aSuccess) { + TodTrace("[TodLib] - Failed to read list items from '%s'", theFileName); + } return aSuccess; } @@ -190,7 +203,7 @@ SexyString TodStringTranslate(const SexyChar *theString) { if (theString != nullptr) { - int aLen = sizeof(theString) / sizeof(theString[0]); + int aLen = StringLength(theString); if (aLen >= 3 && theString[0] == '[') { SexyString aName = SexyCharToString(theString, 1, aLen - 2); diff --git a/src/SexyAppFramework/Common.cpp b/src/SexyAppFramework/Common.cpp index 7b6144e3..2fcce4d0 100644 --- a/src/SexyAppFramework/Common.cpp +++ b/src/SexyAppFramework/Common.cpp @@ -1,4 +1,4 @@ -#include "Common.h" +#include "Common.h" #include "MTRand.h" #include #include @@ -17,7 +17,7 @@ std::string gAppDataFolder = std::filesystem::path(std::getenv("HOME")).string() #else std::string gAppDataFolder = std::filesystem::path(std::getenv("HOME")).string() + "/.config/"; #endif -} +} // namespace Sexy int Sexy::Rand() { @@ -51,7 +51,7 @@ void Sexy::SetAppDataFolder(const std::string &thePath) std::string Sexy::URLEncode(const std::string &theString) { - char *aHexChars = "0123456789ABCDEF"; + const char *aHexChars = "0123456789ABCDEF"; std::string aString; @@ -85,8 +85,16 @@ std::string Sexy::StringToUpper(const std::string &theString) { std::string 
aString; - for (unsigned i = 0; i < theString.length(); i++) - aString += toupper(theString[i]); + auto it = theString.begin(); + auto end = theString.end(); + while (it != end) + { + uint32_t aCodepoint = utf8::next(it, end); + // only touch ascii + if (aCodepoint < 0x80) + aCodepoint = toupper(aCodepoint); + utf8::append(aCodepoint, aString); + } return aString; } @@ -95,8 +103,16 @@ std::string Sexy::StringToLower(const std::string &theString) { std::string aString; - for (unsigned i = 0; i < theString.length(); i++) - aString += tolower(theString[i]); + auto it = theString.begin(); + auto end = theString.end(); + while (it != end) + { + uint32_t aCodepoint = utf8::next(it, end); + // only touch ascii + if (aCodepoint < 0x80) + aCodepoint = tolower(aCodepoint); + utf8::append(aCodepoint, aString); + } return aString; } @@ -113,15 +129,30 @@ std::string Sexy::SexyStringToString(const SexyString &theString) std::string Sexy::Trim(const std::string &theString) { - int aStartPos = 0; - while (aStartPos < (int)theString.length() && isspace(theString[aStartPos])) - aStartPos++; + auto aStart = theString.begin(); + auto anEnd = theString.end(); - int anEndPos = theString.length() - 1; - while (anEndPos >= 0 && isspace(theString[anEndPos])) - anEndPos--; + while (aStart != anEnd) + { + auto it = aStart; + uint32_t aCodepoint = utf8::next(it, anEnd); + // treat non-ascii as not whitespace + if (aCodepoint >= 0x80 || !isspace(aCodepoint)) + break; + aStart = it; + } + + while (anEnd != aStart) + { + auto it = anEnd; + uint32_t aCodepoint = utf8::prior(it, aStart); + // treat non-ascii as not whitespace + if (aCodepoint >= 0x80 || !isspace(aCodepoint)) + break; + anEnd = it; + } - return theString.substr(aStartPos, anEndPos - aStartPos + 1); + return std::string(aStart, anEnd); } bool Sexy::StringToInt(const std::string theString, int *theIntVal) @@ -378,24 +409,17 @@ bool Sexy::AllowAllAccess(const std::string &theFileName) if (aLib == NULL) return false; - BOOL(WINAPI * 
fnSetFileSecurity)( - LPCTSTR lpFileName, SECURITY_INFORMATION SecurityInformation, PSECURITY_DESCRIPTOR pSecurityDescriptor); - BOOL(WINAPI * fnSetSecurityDescriptorDacl)( - PSECURITY_DESCRIPTOR pSecurityDescriptor, BOOL bDaclPresent, PACL pDacl, BOOL bDaclDefaulted); + BOOL(WINAPI * fnSetFileSecurity)(LPCTSTR lpFileName, SECURITY_INFORMATION SecurityInformation, + PSECURITY_DESCRIPTOR pSecurityDescriptor); + BOOL(WINAPI * fnSetSecurityDescriptorDacl)(PSECURITY_DESCRIPTOR pSecurityDescriptor, BOOL bDaclPresent, PACL pDacl, + BOOL bDaclDefaulted); BOOL(WINAPI * fnInitializeSecurityDescriptor)(PSECURITY_DESCRIPTOR pSecurityDescriptor, DWORD dwRevision); - BOOL(WINAPI * fnAllocateAndInitializeSid)(PSID_IDENTIFIER_AUTHORITY pIdentifierAuthority, - BYTE nSubAuthorityCount, - DWORD dwSubAuthority0, - DWORD dwSubAuthority1, - DWORD dwSubAuthority2, - DWORD dwSubAuthority3, - DWORD dwSubAuthority4, - DWORD dwSubAuthority5, - DWORD dwSubAuthority6, - DWORD dwSubAuthority7, - PSID * pSid); - DWORD(WINAPI * fnSetEntriesInAcl)( - ULONG cCountOfExplicitEntries, PEXPLICIT_ACCESS pListOfExplicitEntries, PACL OldAcl, PACL * NewAcl); + BOOL(WINAPI * fnAllocateAndInitializeSid)(PSID_IDENTIFIER_AUTHORITY pIdentifierAuthority, BYTE nSubAuthorityCount, + DWORD dwSubAuthority0, DWORD dwSubAuthority1, DWORD dwSubAuthority2, + DWORD dwSubAuthority3, DWORD dwSubAuthority4, DWORD dwSubAuthority5, + DWORD dwSubAuthority6, DWORD dwSubAuthority7, PSID * pSid); + DWORD(WINAPI * fnSetEntriesInAcl)(ULONG cCountOfExplicitEntries, PEXPLICIT_ACCESS pListOfExplicitEntries, + PACL OldAcl, PACL * NewAcl); PVOID(WINAPI * fnFreeSid)(PSID pSid); *(void **)&fnSetFileSecurity = (void *)GetProcAddress(aLib, "SetFileSecurityA"); @@ -458,7 +482,6 @@ bool Sexy::AllowAllAccess(const std::string &theFileName) #else return false; #endif // WIN32 - } SexyString Sexy::SexyStringFromChar(SexyChar theChar) @@ -668,7 +691,8 @@ std::string Sexy::AddTrailingSlash(const std::string &theDirectory, bool backSla uint64_t 
Sexy::GetLastWriteFileDate(const std::string &theFileName) { auto ftime = std::filesystem::last_write_time(theFileName); - auto sctp = std::chrono::time_point_cast(ftime - std::filesystem::file_time_type::clock::now() + std::chrono::system_clock::now()); + auto sctp = std::chrono::time_point_cast( + ftime - std::filesystem::file_time_type::clock::now() + std::chrono::system_clock::now()); return std::chrono::system_clock::to_time_t(sctp); } @@ -763,8 +787,8 @@ std::string Sexy::Evaluate(const std::string &theString, const DefinesMap &theDe anEvaluatedString.erase(anEvaluatedString.begin() + aPercentPos, anEvaluatedString.begin() + aSecondPercentPos + 1); - anEvaluatedString.insert( - anEvaluatedString.begin() + aPercentPos, aValue.begin(), aValue.begin() + aValue.length()); + anEvaluatedString.insert(anEvaluatedString.begin() + aPercentPos, aValue.begin(), + aValue.begin() + aValue.length()); } return anEvaluatedString; diff --git a/src/SexyAppFramework/DescParser.cpp b/src/SexyAppFramework/DescParser.cpp index 2deee534..de8dac9e 100644 --- a/src/SexyAppFramework/DescParser.cpp +++ b/src/SexyAppFramework/DescParser.cpp @@ -1,6 +1,7 @@ #include "DescParser.h" #include "../PakLib/PakInterface.h" #include "Common.h" +#include "Encoding.h" using namespace Sexy; @@ -288,10 +289,8 @@ bool DescParser::DataToDoubleVector(DataElement *theSource, DoubleVector *theDou return true; } -bool DescParser::ParseToList(const std::string &theString, - ListDataElement *theList, - bool expectListEnd, - int *theStringPos) +bool DescParser::ParseToList(std::string::const_iterator &it, const std::string::const_iterator &end, + ListDataElement *theList, bool expectListEnd) { bool inSingleQuotes = false; bool inDoubleQuotes = false; @@ -299,15 +298,10 @@ bool DescParser::ParseToList(const std::string &theString, SingleDataElement *aCurSingleDataElement = NULL; - int aStringPos = 0; - - if (theStringPos == NULL) - theStringPos = &aStringPos; - - while (*theStringPos < 
(int)theString.length()) + while (it != end) { bool addSingleChar = false; - char aChar = theString[(*theStringPos)++]; + uint32_t aChar = utf8::next(it, end); // really a Codepoint bool isSeperator = (aChar == ' ') || (aChar == '\t') || (aChar == '\n') || (aChar == ','); @@ -350,7 +344,7 @@ bool DescParser::ParseToList(const std::string &theString, { ListDataElement *aChildList = new ListDataElement(); - if (!ParseToList(theString, aChildList, true, theStringPos)) + if (!ParseToList(it, end, aChildList, true)) return false; theList->mElementVector.push_back(aChildList); @@ -376,7 +370,7 @@ bool DescParser::ParseToList(const std::string &theString, theList->mElementVector.push_back(aCurSingleDataElement); } - aCurSingleDataElement->mString += aChar; + utf8::append(aChar, aCurSingleDataElement->mString); } } @@ -404,7 +398,8 @@ bool DescParser::ParseToList(const std::string &theString, bool DescParser::ParseDescriptorLine(const std::string &theDescriptorLine) { ListDataElement aParams; - if (!ParseToList(theDescriptorLine, &aParams, false, NULL)) + auto it = theDescriptorLine.begin(); + if (!ParseToList(it, theDescriptorLine.end(), &aParams, false)) return false; if (aParams.mElementVector.size() > 0) @@ -433,15 +428,40 @@ bool DescParser::LoadDescriptor(const std::string &theFileName) mError.erase(); mError.erase(mError.begin()); - PFILE *aStream = p_fopen(theFileName.c_str(), "r"); - if (aStream == NULL) + PFILE *pFile = p_fopen(theFileName.c_str(), "rb"); + if (pFile == nullptr) + { + return false; + } + p_fseek(pFile, 0, SEEK_END); + auto aSize = p_ftell(pFile); + p_fseek(pFile, 0, SEEK_SET); + + if (aSize <= 0) + { + p_fclose(pFile); return false; + } - char aBuffChar = 0; + std::string aBytes; + aBytes.resize(aSize); - while (!p_feof(aStream)) + std::size_t aReadSize = p_fread(&aBytes[0], sizeof(char), aSize, pFile); + p_fclose(pFile); + + if (aReadSize == 0) + return false; + + auto aDecoded = Sexy::ConvertToUtf8IfNeeded(aBytes); + const auto &aStream = 
aDecoded ? *aDecoded : aBytes; + auto it = aStream.begin(); + auto end = aStream.end(); + + uint32_t aBuffChar = 0; // really a Codepoint + + while (it != end) { - int aChar; + uint32_t aChar; // really a Codepoint bool skipLine = false; bool atLineStart = true; @@ -459,9 +479,11 @@ bool DescParser::LoadDescriptor(const std::string &theFileName) } else { - aChar = p_fgetc(aStream); - if (aChar == EOF) + if (it == end) + { break; + } + aChar = utf8::next(it, end); } if (aChar != '\r') @@ -524,13 +546,12 @@ bool DescParser::LoadDescriptor(const std::string &theFileName) if (mCurrentLine.size() == 0) mCurrentLineNum = aLineCount + 1; - mCurrentLine += aChar; + utf8::append(aChar, mCurrentLine); } } } } } - mCurrentLine = ANSIToUTF8(mCurrentLine); //fix encoding cause fuck windows!!! if (mCurrentLine.length() > 0) { @@ -551,6 +572,5 @@ bool DescParser::LoadDescriptor(const std::string &theFileName) mCurrentLine.erase(); mCurrentLineNum = 0; - p_fclose(aStream); return !hasErrors; } diff --git a/src/SexyAppFramework/DescParser.h b/src/SexyAppFramework/DescParser.h index 356f2c05..735282b7 100644 --- a/src/SexyAppFramework/DescParser.h +++ b/src/SexyAppFramework/DescParser.h @@ -85,7 +85,8 @@ class DescParser bool DataToList(DataElement *theSource, ListDataElement *theValues); bool DataToIntVector(DataElement *theSource, IntVector *theIntVector); bool DataToDoubleVector(DataElement *theSource, DoubleVector *theDoubleVector); - bool ParseToList(const std::string &theString, ListDataElement *theList, bool expectListEnd, int *theStringPos); + bool ParseToList(std::string::const_iterator &it, const std::string::const_iterator &end, ListDataElement *theList, + bool expectListEnd); bool ParseDescriptorLine(const std::string &theDescriptorLine); // You must implement this one diff --git a/src/SexyAppFramework/Encoding.cpp b/src/SexyAppFramework/Encoding.cpp new file mode 100644 index 00000000..8afa43d3 --- /dev/null +++ b/src/SexyAppFramework/Encoding.cpp @@ -0,0 +1,88 @@ 
+#include "Encoding.h" +#include "Common.h" + +namespace Sexy +{ +// Convert a UTF-16 byte buffer to UTF-8. Return nullopt if input is not valid UTF-16 +std::optional TryUtf16ToUtf8(const unsigned char *theBytes, std::size_t theByteCount, bool isBigEndian) +{ + int aUnitCount = theByteCount / 2; + std::u16string aWide; + aWide.reserve(aUnitCount); + for (int i = 0; i < aUnitCount; i++) + { + auto aFirst = theBytes[i * 2]; + auto aSecond = theBytes[i * 2 + 1]; + char16_t aUnit = isBigEndian ? (char16_t)((aFirst << 8) | aSecond) : (char16_t)((aSecond << 8) | aFirst); + aWide.push_back(aUnit); + } + + try + { + std::string aResult; + utf8::utf16to8(aWide.begin(), aWide.end(), std::back_inserter(aResult)); + return std::move(aResult); + } + catch (...) + { + return {}; + } +} + +// Decode raw file bytes to UTF-8, auto-detecting the encoding and using ANSI +// as a fallback. Returning nullopt means the input is already valid UTF8 without BOM +std::optional ConvertToUtf8IfNeeded(const std::string &theRaw) +{ + auto aSize = theRaw.size(); + auto aBytes = reinterpret_cast(theRaw.data()); + + // Try UTF-8 first - modern devs should use this + if (aSize >= 3 && aBytes[0] == 0xEF && aBytes[1] == 0xBB && aBytes[2] == 0xBF) + { + // strip BOM if present: EF BB BF + // - used by font descriptors in chinese versions + std::string aUtf8 = theRaw.substr(3); + if (utf8::is_valid(aUtf8)) + { + return aUtf8; + } + return ANSIToUTF8(theRaw); + } + + // UTF-8 without BOM + if (utf8::is_valid(theRaw)) + { + return {}; + } + + // Try UTF-16 + if (aSize >= 2) + { // need at least 2 bytes to make one char16 + // UTF-16 with BOM + if (aBytes[0] == 0xFF && aBytes[1] == 0xFE) + { + // LE - the chinese version of the original game uses this + auto aMaybeUtf16 = TryUtf16ToUtf8(aBytes + 2, aSize - 2, false); + if (aMaybeUtf16) + { + return std::move(*aMaybeUtf16); + } + } + + if (aBytes[0] == 0xFE && aBytes[1] == 0xFF) + { + // BE + auto aMaybeUtf16 = TryUtf16ToUtf8(aBytes + 2, aSize - 2, true); 
+ if (aMaybeUtf16) + { + return std::move(*aMaybeUtf16); + } + } + + // we don't handle UTF-16 without BOM for now + } + + // Fallback - this is what the english version uses + return ANSIToUTF8(theRaw); +} +} // namespace Sexy diff --git a/src/SexyAppFramework/Encoding.h b/src/SexyAppFramework/Encoding.h new file mode 100644 index 00000000..8f4b8c97 --- /dev/null +++ b/src/SexyAppFramework/Encoding.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include + +namespace Sexy +{ + +// Convert a UTF-16 byte buffer to UTF-8. Return nullopt if input is not valid UTF-16 +std::optional TryUtf16ToUtf8(const unsigned char *theBytes, std::size_t theByteCount, bool isBigEndian); + +// Convert a buffer to UTF-8 (without BOM) if it's not already. Return nullopt if the input is already UTF-8 (without BOM) +// Supports UTF-16 LE/BE with BOM. Fallback to ANSI +std::optional ConvertToUtf8IfNeeded(const std::string &theRaw); + +} // namespace Sexy diff --git a/src/SexyAppFramework/Font.cpp b/src/SexyAppFramework/Font.cpp index d3576cd7..30baea96 100644 --- a/src/SexyAppFramework/Font.cpp +++ b/src/SexyAppFramework/Font.cpp @@ -58,7 +58,9 @@ int Font::StringWidth(const SexyString &theString) int Font::CharWidth(SexyChar theChar) { - SexyString aString(1, theChar); + // theChar is really a Codepoint (see derived class StringWidth implementations) + std::string aString; + utf8::append(theChar, std::back_inserter(aString)); return StringWidth(aString); } diff --git a/src/SexyAppFramework/ImageFont.cpp b/src/SexyAppFramework/ImageFont.cpp index 74172e81..97d1214f 100644 --- a/src/SexyAppFramework/ImageFont.cpp +++ b/src/SexyAppFramework/ImageFont.cpp @@ -969,6 +969,10 @@ bool FontData::HandleCommand(const ListDataElement &theParams) else invalidNumParams = true; } + else if (stricmp(aCmd.c_str(), "LayerSetExInfo") == 0) + { + // used in chinese versions, not implemented for now + } else { Error("Unknown Command"); diff --git a/src/main.cpp b/src/main.cpp index 8e075ff1..58257734 100644 --- 
a/src/main.cpp +++ b/src/main.cpp @@ -1,5 +1,4 @@ #include "LawnApp.h" -#include "Resources.h" #include "Sexy.TodLib/TodStringFile.h" using namespace Sexy; @@ -27,4 +26,4 @@ int main(int argc, char **argv) delete gLawnApp; return 0; -}; \ No newline at end of file +};