RetroAchievements · Jamiras · Dec 20, 2025 · Dec 13, 2025 · Dec 13, 2025
diff --git a/src/devkit/util/Strings.cpp b/src/devkit/util/Strings.cpp
@@ -2,17 +2,141 @@
 
 #include "TypeCasts.hh"
 
-// TODO: not rely on Windows API
-// for ::MultiByteToWideChar and ::WideCharToMultiByte
-#define WIN32_LEAN_AND_MEAN
-#include <Windows.h>
-
 namespace ra {
 
+// Counts the UTF-8 bytes needed to encode the nTextLength UTF-16 code units starting at sText.
+// bAsciiOnlyOut is set to true when every code unit is below 0x80, so the caller can copy units directly.
+// The text is read through a uint16_t pointer, i.e. wchar_t is treated as a 16-bit UTF-16 code unit.
+static size_t CalculateUtf8Length(const wchar_t* sText, size_t nTextLength, bool& bAsciiOnlyOut) noexcept
+{
+    if (!sText || !nTextLength)
+    {
+        bAsciiOnlyOut = true;
+        return 0;
+    }
+
+    bool bAsciiOnly = true;
+    size_t nUtf8Length = 0;
+
+    GSL_SUPPRESS_TYPE1 const uint16_t* pSrc = reinterpret_cast<const uint16_t*>(sText);
+    if (pSrc)
+    {
+        const uint16_t* pStop = pSrc + nTextLength;
+        while (pSrc < pStop)
+        {
+            const auto c = *pSrc++;
+            if (c < 0x80)
+            {
+                ++nUtf8Length;
+                continue;
+            }
+
+            bAsciiOnly = false;
+
+            if (c < 0x800)
+            {
+                nUtf8Length += 2;
+            }
+            else if (c >= 0xD800 && c <= 0xDBFF && pSrc < pStop && *pSrc >= 0xDC00 && *pSrc <= 0xDFFF)
+            {
+                // a surrogate pair (two code units) takes four bytes total.
+                // pSrc < pStop keeps the peek at the low surrogate inside the input.
+                nUtf8Length += 4;
+                ++pSrc;
+            }
+            else
+            {
+                // remaining BMP characters and unpaired surrogates take three bytes
+                nUtf8Length += 3;
+            }
+        }
+    }
+
+    bAsciiOnlyOut = bAsciiOnly;
+    return nUtf8Length;
+}
+
+// Converts sTextLength UTF-16 code units starting at sText into a UTF-8 encoded std::string.
+// Returns an empty string when sText is null. Unpaired surrogates are encoded as U+FFFD.
+// The text is read through a uint16_t pointer, i.e. wchar_t is treated as a 16-bit UTF-16 code unit.
+static std::string Narrow(const wchar_t* sText, size_t sTextLength)
+{
+    if (!sText)
+        return {};
+
+    // first pass: size the output so it can be written through a raw pointer
+    bool bAsciiOnly = true;
+    const auto nUtf8Length = CalculateUtf8Length(sText, sTextLength, bAsciiOnly);
+
+    std::string sResult;
+    sResult.resize(nUtf8Length);
+    GSL_SUPPRESS_TYPE1 uint8_t* pOut = reinterpret_cast<uint8_t*>(sResult.data());
+
+    GSL_SUPPRESS_TYPE1 const uint16_t* pSrc = reinterpret_cast<const uint16_t*>(sText);
+    const uint16_t* pStop = pSrc + sTextLength;
+
+    if (!pOut || !pSrc)
+        return sResult;
+
+    if (bAsciiOnly)
+    {
+        // every code unit fits in one byte - copy without any further checks
+        while (pSrc < pStop)
+            *pOut++ = gsl::narrow_cast<uint8_t>(*pSrc++);
+        *pOut = '\0';
+        return sResult;
+    }
+
+    while (pSrc < pStop)
+    {
+        uint16_t c = *pSrc++;
+        if (c < 0x80)
+        {
+            *pOut++ = gsl::narrow_cast<uint8_t>(c);
+            continue;
+        }
+
+        if (c < 0x800)
+        {
+            *pOut++ = 0xC0 | gsl::narrow_cast<uint8_t>(c >> 6);
+            *pOut++ = 0x80 | gsl::narrow_cast<uint8_t>(c & 0x3F);
+            continue;
+        }
+
+        if (c >= 0xD800 && c <= 0xDFFF)
+        {
+            if (c >= 0xDC00)
+            {
+                // low surrogate without a preceding high surrogate
+                c = 0xFFFD;
+            }
+            else
+            {
+                // only peek at the next code unit when it is still inside the input;
+                // a high surrogate in the last position is treated as unpaired.
+                const uint16_t c2 = (pSrc < pStop) ? *pSrc : 0;
+                if (c2 < 0xDC00 || c2 > 0xDFFF)
+                {
+                    // high surrogate not followed by a low surrogate
+                    c = 0xFFFD;
+                }
+                else
+                {
+                    // decode surrogate pair
+                    const uint32_t cx = (((c & 0x03FF) << 10) | (c2 & 0x3FF)) + 0x10000;
+                    *pOut++ = 0xF0 | gsl::narrow_cast<uint8_t>(cx >> 18);
+                    *pOut++ = 0x80 | gsl::narrow_cast<uint8_t>((cx >> 12) & 0x3F);
+                    *pOut++ = 0x80 | gsl::narrow_cast<uint8_t>((cx >> 6) & 0x3F);
+                    *pOut++ = 0x80 | gsl::narrow_cast<uint8_t>(cx & 0x3F);
+                    ++pSrc;
+                    continue;
+                }
+            }
+        }
+
+        // three byte encoding (including the U+FFFD substituted above)
+        *pOut++ = 0xE0 | gsl::narrow_cast<uint8_t>(c >> 12);
+        *pOut++ = 0x80 | gsl::narrow_cast<uint8_t>((c >> 6) & 0x3F);
+        *pOut++ = 0x80 | gsl::narrow_cast<uint8_t>(c & 0x3F);
+    }
+
+    // trim to the number of bytes actually written
+    *pOut = '\0';
+    GSL_SUPPRESS_TYPE1 const auto nActualSize = pOut - reinterpret_cast<uint8_t*>(sResult.data());
+    sResult.resize(nActualSize);
+    return sResult;
+}
+
 _Use_decl_annotations_
 std::string Narrow(const std::wstring& wstr)
 {
-    return Narrow(wstr.c_str());
+    return Narrow(wstr.c_str(), wstr.length()); // converts exactly wstr.length() code units; the length is passed in rather than found via wcslen
 }
 
 std::string Narrow(std::wstring&& wstr) noexcept
@@ -24,20 +148,191 @@ std::string Narrow(std::wstring&& wstr) noexcept
 _Use_decl_annotations_
 std::string Narrow(const wchar_t* wstr)
 {
-    const auto len = gsl::narrow_cast<int>(std::wcslen(wstr));
-    const auto needed = ::WideCharToMultiByte(CP_UTF8, 0, wstr, len + 1, nullptr, 0, nullptr, nullptr);
+    return Narrow(wstr, wcslen(wstr)); // wstr must be a non-null, null-terminated string: wcslen runs before the helper's own null check
+}
 
-    std::string str(ra::to_unsigned(needed), '\000'); // allocate required space (including terminator)
-    ::WideCharToMultiByte(CP_UTF8, 0, wstr, len + 1, str.data(), gsl::narrow_cast<unsigned int>(str.capacity()),
-        nullptr, nullptr);
-    str.resize(ra::to_unsigned(needed - 1)); // terminator is not actually part of the string
-    return str;
+// Number of trail bytes that follow each byte 0x80-0xFF, indexed by (byte & 0x7F). See https://en.wikipedia.org/wiki/UTF-8#Byte_map
+static const uint8_t UTF_NUM_TRAIL_BYTES[128] =
+{
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x80-0x8f: trail bytes themselves, nothing follows
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0x90-0x9f: trail bytes themselves, nothing follows
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xa0-0xaf: trail bytes themselves, nothing follows
+    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 0xb0-0xbf: trail bytes themselves, nothing follows
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0xc0-0xcf: lead byte of a two-byte sequence
+    1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // 0xd0-0xdf: lead byte of a two-byte sequence
+    2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // 0xe0-0xef: lead byte of a three-byte sequence
+    3,3,3,3,3,3,3,3,4,4,4,4,5,5,0,0  // 0xf0-0xff: four-byte (f0-f7), five-byte (f8-fb), six-byte (fc-fd) leads; fe/ff have no entry
+};
+
+// Counts the UTF-16 code units needed to hold the nTextLength bytes of UTF-8 text starting at sText.
+// bAsciiOnlyOut is set to true when no byte has its high bit set, so the caller can copy bytes directly.
+// Each lead byte counts as one unit (two for a four-byte sequence, which needs a surrogate pair) and
+// the trail bytes it announces are skipped without being inspected. Stray trail bytes count as one unit.
+static size_t CalculateUnicodeLength(const char* sText, size_t nTextLength, bool& bAsciiOnlyOut) noexcept
+{
+    if (!sText || !nTextLength)
+    {
+        bAsciiOnlyOut = true;
+        return 0;
+    }
+
+    bool bAsciiOnly = true;
+
+    size_t nUnicodeLength = 0;
+    size_t nRemaining = nTextLength;
+    GSL_SUPPRESS_TYPE1 const uint8_t* p8 = reinterpret_cast<const uint8_t*>(sText);
+    if (p8)
+    {
+        // fast path: test eight bytes at a time for as long as they are all ASCII.
+        // The bytes are copied into a local with memcpy because sText has no alignment
+        // guarantee, so it cannot be dereferenced as a uint64_t directly.
+        while (nRemaining >= sizeof(uint64_t))
+        {
+            uint64_t nChunk = 0;
+            memcpy(&nChunk, p8, sizeof(nChunk));
+
+            // if any byte is not an ASCII character, switch to the slower algorithm
+            if (nChunk & 0x8080808080808080ULL)
+                break;
+
+            p8 += sizeof(nChunk);
+            nUnicodeLength += sizeof(nChunk);
+            nRemaining -= sizeof(nChunk);
+        }
+
+        while (nRemaining)
+        {
+            --nRemaining;
+            ++nUnicodeLength;
+
+            const uint8_t c = *p8++;
+            if ((c & 0x80) == 0)
+            {
+                // single byte character
+                continue;
+            }
+
+            bAsciiOnly = false;
+
+            if ((c & 0x40) == 0)
+            {
+                // trail byte without lead
+                continue;
+            }
+
+            GSL_SUPPRESS_BOUNDS4 const auto nAdditional = UTF_NUM_TRAIL_BYTES[c & 0x7F];
+            if (nAdditional > nRemaining) // not enough remaining
+                break;
+
+            if (nAdditional > 3)
+            {
+                // 5/6 byte UTF-8 sequences not supported in UTF-16 - will be replaced with xFFFD
+            }
+            else if (nAdditional == 3)
+            {
+                // extra space for surrogate pair
+                ++nUnicodeLength;
+            }
+
+            // skip the trail bytes belonging to this lead byte
+            p8 += nAdditional;
+            nRemaining -= nAdditional;
+        }
+    }
+
+    bAsciiOnlyOut = bAsciiOnly;
+    return nUnicodeLength;
+}
+
+// Converts nTextLength bytes of UTF-8 text starting at sText into a UTF-16 encoded std::wstring.
+// Returns an empty string when sText is null. Stray trail bytes, truncated or malformed sequences,
+// 5/6 byte sequences, the bytes 0xFE/0xFF and values above U+10FFFF are each replaced with U+FFFD.
+// The output is written through a uint16_t pointer, i.e. wchar_t is treated as a 16-bit UTF-16 code unit.
+// This function never writes more code units for a sequence than CalculateUnicodeLength reserves for it.
+static std::wstring Widen(const char* sText, size_t nTextLength)
+{
+    if (!sText)
+        return {};
+
+    // first pass: size the output so it can be written through a raw pointer
+    bool bAsciiOnly = true;
+    const auto nUnicodeLength = CalculateUnicodeLength(sText, nTextLength, bAsciiOnly);
+
+    std::wstring sResult;
+    sResult.resize(nUnicodeLength);
+    GSL_SUPPRESS_TYPE1 uint16_t* pOut = reinterpret_cast<uint16_t*>(sResult.data());
+
+    GSL_SUPPRESS_TYPE1 const uint8_t* pSrc = reinterpret_cast<const uint8_t*>(sText);
+    const uint8_t* pStop = pSrc + nTextLength;
+
+    if (!pOut || !pSrc)
+        return sResult;
+
+    if (bAsciiOnly)
+    {
+        // every byte is a complete character - copy without any further checks
+        while (pSrc < pStop)
+            *pOut++ = gsl::narrow_cast<uint16_t>(*pSrc++);
+        *pOut = '\0';
+        return sResult;
+    }
+
+    while (pSrc < pStop)
+    {
+        const uint8_t c = *pSrc++;
+        if ((c & 0x80) == 0)
+        {
+            *pOut++ = gsl::narrow_cast<uint16_t>(c);
+            continue;
+        }
+
+        if ((c & 0xC0) == 0x80)
+        {
+            // trail byte
+            *pOut++ = 0xFFFD;
+            continue;
+        }
+
+        GSL_SUPPRESS_BOUNDS4 auto nAdditional = UTF_NUM_TRAIL_BYTES[c & 0x7F];
+        if (nAdditional == 0)
+        {
+            // 0xFE and 0xFF are not lead bytes; without this check they would decode to '>' and '?'
+            *pOut++ = 0xFFFD;
+            continue;
+        }
+
+        // compare lengths rather than forming a pointer that may lie beyond the end of the input
+        if (nAdditional > gsl::narrow_cast<size_t>(pStop - pSrc))
+        {
+            // not enough data
+            *pOut++ = 0xFFFD;
+            break;
+        }
+
+        if (nAdditional > 3)
+        {
+            // 5/6 byte UTF-8 characters not supported by UTF-16. Only one code unit was reserved
+            // for them, so skip the trail bytes and never emit a surrogate pair here.
+            pSrc += nAdditional;
+            *pOut++ = 0xFFFD;
+            continue;
+        }
+
+        uint32_t nAccumulator = gsl::narrow_cast<uint32_t>(c);
+
+        // 1 additional -> 0x1F, 2 -> 0x0F, 3 -> 0x07
+        nAccumulator &= (1 << (6 - nAdditional)) - 1;
+
+        bool bInvalid = false;
+        while (nAdditional)
+        {
+            const uint8_t c2 = *pSrc++;
+            bInvalid |= ((c2 & 0xC0) != 0x80);
+            nAccumulator <<= 6;
+            nAccumulator |= (c2 & 0x3F);
+            --nAdditional;
+        }
+
+        if (bInvalid)
+        {
+            *pOut++ = 0xFFFD;
+        }
+        else if (nAccumulator < 0x10000)
+        {
+            // fits in a single code unit. U+FFFF belongs here: treating it as supplementary
+            // would underflow the subtraction below and emit two bogus code units.
+            *pOut++ = gsl::narrow_cast<uint16_t>(nAccumulator & 0xFFFF);
+        }
+        else if (nAccumulator < 0x110000)
+        {
+            // convert to surrogate pair (only reachable from a four-byte sequence)
+            nAccumulator -= 0x10000;
+            *pOut++ = 0xD800 | gsl::narrow_cast<uint16_t>(nAccumulator >> 10);
+            *pOut++ = 0xDC00 | gsl::narrow_cast<uint16_t>(nAccumulator & 0x03FF);
+        }
+        else
+        {
+            // beyond U+10FFFF - cannot be represented in UTF-16.
+            *pOut++ = 0xFFFD;
+        }
+    }
+
+    // trim to the number of code units actually written
+    *pOut = '\0';
+    GSL_SUPPRESS_TYPE1 const auto nActualSize = pOut - reinterpret_cast<uint16_t*>(sResult.data());
+    sResult.resize(nActualSize);
+    return sResult;
 }
 
 _Use_decl_annotations_
 std::wstring Widen(const std::string& str)
 {
-    return Widen(str.c_str());
+    return Widen(str.c_str(), str.length()); // converts exactly str.length() bytes; the length is passed in rather than found via strlen
 }
 
 std::wstring Widen(std::string&& str) noexcept
@@ -49,15 +344,7 @@ std::wstring Widen(std::string&& str) noexcept
 _Use_decl_annotations_
 std::wstring Widen(const char* str)
 {
-    const auto len = gsl::narrow_cast<int>(std::strlen(str));
-    const auto needed = ::MultiByteToWideChar(CP_UTF8, 0, str, len + 1, nullptr, 0);
-    // doesn't seem wchar_t is treated like a character type by default
-    std::wstring wstr(ra::to_unsigned(needed), L'\x0');
-    ::MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, len + 1, wstr.data(),
-        gsl::narrow_cast<unsigned int>(wstr.capacity()));
-    wstr.resize(ra::to_unsigned(needed - 1));
-
-    return wstr;
+    return Widen(str, strlen(str)); // str must be a non-null, null-terminated string: strlen runs before the helper's own null check
 }
 
 _Use_decl_annotations_

diff --git a/tests/devkit/util/Strings_Tests.cpp b/tests/devkit/util/Strings_Tests.cpp
@@ -46,6 +46,7 @@ TEST_CLASS(RA_StringUtils_Tests)
         Assert::AreEqual(std::wstring(L"Test"), Widen(L"Test"));
         Assert::AreEqual(std::wstring(L"Test"), Widen(std::string("Test")));
         Assert::AreEqual(std::wstring(L"Test"), Widen(std::wstring(L"Test")));
+        Assert::AreEqual(std::wstring(L"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()-=_+"), Widen(std::string("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()-=_+")));
 
         // U+1F30F - EARTH GLOBE ASIA-AUSTRALIA
         Assert::AreEqual(std::wstring(L"\xD83C\xDF0F"), Widen("\xF0\x9F\x8C\x8F"));