Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
333 changes: 310 additions & 23 deletions src/devkit/util/Strings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,17 +2,141 @@

#include "TypeCasts.hh"

// TODO: not rely on Windows API
// for ::MultiByteToWideChar and ::WideCharToMultiByte
#define WIN32_LEAN_AND_MEAN
#include <Windows.h>

namespace ra {

// Calculates the number of bytes needed to encode a UTF-16 string as UTF-8.
//
// sText         - UTF-16 code units to measure (may be null).
// nTextLength   - number of code units in sText; the text does not need to be null terminated.
// bAsciiOnlyOut - set to true if every code unit is below 0x80 (also true for null/empty input).
//
// Returns the number of UTF-8 bytes required, not including a terminator. Unpaired surrogates
// are counted as three bytes, the size of the U+FFFD replacement character.
//
// NOTE(review): the text is read as uint16_t, which assumes wchar_t is 16 bits wide - confirm
// before building for a platform where it is not.
static size_t CalculateUtf8Length(const wchar_t* sText, size_t nTextLength, bool& bAsciiOnlyOut) noexcept
{
    if (!sText || !nTextLength)
    {
        bAsciiOnlyOut = true;
        return 0;
    }

    bool bAsciiOnly = true;
    size_t nUtf8Length = 0;

    GSL_SUPPRESS_TYPE1 const uint16_t* pSrc = reinterpret_cast<const uint16_t*>(sText);
    if (pSrc)
    {
        const uint16_t* pStop = pSrc + nTextLength;
        while (pSrc < pStop)
        {
            const auto c = *pSrc++;
            if (c < 0x80)
            {
                ++nUtf8Length;
                continue;
            }

            bAsciiOnly = false;

            if (c < 0x800)
            {
                nUtf8Length += 2;
            }
            else if (c >= 0xD800 && c <= 0xDBFF && pSrc < pStop && *pSrc >= 0xDC00 && *pSrc <= 0xDFFF)
            {
                // a surrogate pair (two code units) encodes to four bytes total.
                // pSrc is checked against pStop first so a high surrogate at the very end
                // of the input never causes a read beyond the provided length.
                nUtf8Length += 4;
                ++pSrc;
            }
            else
            {
                // remaining BMP characters, and unpaired surrogates (replaced with U+FFFD)
                nUtf8Length += 3;
            }
        }
    }

    bAsciiOnlyOut = bAsciiOnly;
    return nUtf8Length;
}

// Converts a UTF-16 string of known length to UTF-8.
//
// sText       - UTF-16 code units to convert (may be null, which yields an empty string).
// sTextLength - number of code units in sText; embedded nulls are converted like any other character.
//
// Returns the UTF-8 encoded string. Unpaired surrogates are replaced with U+FFFD.
static std::string Narrow(const wchar_t* sText, size_t sTextLength)
{
    if (!sText)
        return {};

    bool bAsciiOnly = true;
    const auto nUtf8Length = CalculateUtf8Length(sText, sTextLength, bAsciiOnly);

    // resize() provides the null terminator, so only the payload has to be written
    std::string sResult;
    sResult.resize(nUtf8Length);
    GSL_SUPPRESS_TYPE1 uint8_t* pOut = reinterpret_cast<uint8_t*>(sResult.data());

    GSL_SUPPRESS_TYPE1 const uint16_t* pSrc = reinterpret_cast<const uint16_t*>(sText);
    const uint16_t* pStop = pSrc + sTextLength;

    if (!pOut || !pSrc)
        return sResult;

    if (bAsciiOnly)
    {
        // fast path: every code unit maps directly to a single byte
        while (pSrc < pStop)
            *pOut++ = gsl::narrow_cast<uint8_t>(*pSrc++);
        return sResult;
    }

    while (pSrc < pStop)
    {
        uint16_t c = *pSrc++;
        if (c < 0x80)
        {
            *pOut++ = gsl::narrow_cast<uint8_t>(c);
            continue;
        }

        if (c < 0x800)
        {
            *pOut++ = 0xC0 | gsl::narrow_cast<uint8_t>(c >> 6);
            *pOut++ = 0x80 | gsl::narrow_cast<uint8_t>(c & 0x3F);
            continue;
        }

        if (c >= 0xD800 && c <= 0xDFFF)
        {
            // only a high surrogate immediately followed (within the input) by a low surrogate
            // is valid. the bounds check must mirror CalculateUtf8Length so the two functions
            // always agree on the number of bytes produced.
            const bool bHighSurrogate = (c <= 0xDBFF);
            if (bHighSurrogate && pSrc < pStop && *pSrc >= 0xDC00 && *pSrc <= 0xDFFF)
            {
                // decode surrogate pair
                const uint16_t c2 = *pSrc++;
                const uint32_t cx = (((c & 0x03FF) << 10) | (c2 & 0x03FF)) + 0x10000;
                *pOut++ = 0xF0 | gsl::narrow_cast<uint8_t>(cx >> 18);
                *pOut++ = 0x80 | gsl::narrow_cast<uint8_t>((cx >> 12) & 0x3F);
                *pOut++ = 0x80 | gsl::narrow_cast<uint8_t>((cx >> 6) & 0x3F);
                *pOut++ = 0x80 | gsl::narrow_cast<uint8_t>(cx & 0x3F);
                continue;
            }

            // low surrogate without a preceding high surrogate, or high surrogate
            // without a following low surrogate
            c = 0xFFFD;
        }

        *pOut++ = gsl::narrow_cast<uint8_t>(0xE0 | (c >> 12));
        *pOut++ = gsl::narrow_cast<uint8_t>(0x80 | ((c >> 6) & 0x3F));
        *pOut++ = gsl::narrow_cast<uint8_t>(0x80 | (c & 0x3F));
    }

    // trim to the number of bytes actually written
    GSL_SUPPRESS_TYPE1 const auto nActualSize = pOut - reinterpret_cast<uint8_t*>(sResult.data());
    sResult.resize(gsl::narrow_cast<size_t>(nActualSize));
    return sResult;
}

// Converts a UTF-16 std::wstring to a UTF-8 std::string.
_Use_decl_annotations_
std::string Narrow(const std::wstring& wstr)
{
    // pass the length explicitly so the whole string is converted,
    // rather than stopping at the first embedded null character
    return Narrow(wstr.c_str(), wstr.length());
}

std::string Narrow(std::wstring&& wstr) noexcept
Expand All @@ -24,20 +148,191 @@ std::string Narrow(std::wstring&& wstr) noexcept
// Converts a null-terminated UTF-16 string to a UTF-8 std::string.
// A null pointer yields an empty string.
_Use_decl_annotations_
std::string Narrow(const wchar_t* wstr)
{
    // wcslen must not be called with a null pointer
    if (!wstr)
        return {};

    return Narrow(wstr, wcslen(wstr));
}
// Number of trail bytes expected after a UTF-8 lead byte.
// Only bytes with the high bit set are represented, so the table is indexed by (byte & 0x7F):
// entry 0 describes byte 0x80 and entry 127 describes byte 0xFF.
// https://en.wikipedia.org/wiki/UTF-8#Byte_map
static constexpr uint8_t UTF_NUM_TRAIL_BYTES[128] =
{
    // 0x80-0xBF: no trail bytes
    0, 0, 0, 0, 0, 0, 0, 0, // 0x80-0x87
    0, 0, 0, 0, 0, 0, 0, 0, // 0x88-0x8f
    0, 0, 0, 0, 0, 0, 0, 0, // 0x90-0x97
    0, 0, 0, 0, 0, 0, 0, 0, // 0x98-0x9f
    0, 0, 0, 0, 0, 0, 0, 0, // 0xa0-0xa7
    0, 0, 0, 0, 0, 0, 0, 0, // 0xa8-0xaf
    0, 0, 0, 0, 0, 0, 0, 0, // 0xb0-0xb7
    0, 0, 0, 0, 0, 0, 0, 0, // 0xb8-0xbf

    // 0xC0-0xDF: one trail byte
    1, 1, 1, 1, 1, 1, 1, 1, // 0xc0-0xc7
    1, 1, 1, 1, 1, 1, 1, 1, // 0xc8-0xcf
    1, 1, 1, 1, 1, 1, 1, 1, // 0xd0-0xd7
    1, 1, 1, 1, 1, 1, 1, 1, // 0xd8-0xdf

    // 0xE0-0xEF: two trail bytes
    2, 2, 2, 2, 2, 2, 2, 2, // 0xe0-0xe7
    2, 2, 2, 2, 2, 2, 2, 2, // 0xe8-0xef

    // 0xF0-0xF7: three trail bytes
    3, 3, 3, 3, 3, 3, 3, 3, // 0xf0-0xf7

    // 0xF8-0xFB: four trail bytes, 0xFC-0xFD: five trail bytes, 0xFE-0xFF: none
    4, 4, 4, 4, 5, 5, 0, 0  // 0xf8-0xff
};

// Calculates the number of UTF-16 code units needed to hold a UTF-8 string.
//
// sText         - UTF-8 bytes to measure (may be null).
// nTextLength   - number of bytes in sText; the text does not need to be null terminated.
// bAsciiOnlyOut - set to true if every byte is below 0x80 (also true for null/empty input).
//
// Returns the number of code units required, not including a terminator. The value is an
// upper bound: a four-byte sequence always reserves room for a surrogate pair even if it
// later turns out to be invalid and is replaced with a single U+FFFD.
static size_t CalculateUnicodeLength(const char* sText, size_t nTextLength, bool& bAsciiOnlyOut) noexcept
{
    if (!sText || !nTextLength)
    {
        bAsciiOnlyOut = true;
        return 0;
    }

    bool bAsciiOnly = true;

    size_t nUnicodeLength = 0;
    size_t nRemaining = nTextLength;

    // NOTE(review): sText is read eight bytes at a time without an alignment check -
    // confirm all supported targets tolerate unaligned 64-bit loads.
    GSL_SUPPRESS_TYPE1 const uint64_t* p64 = reinterpret_cast<const uint64_t*>(sText);
    if (p64)
    {
        while (nRemaining > 8)
        {
            // if any byte is not an ASCII character, switch to the slower algorithm
            if (*p64 & 0x8080808080808080)
                break;

            ++p64;
            nUnicodeLength += 8;
            nRemaining -= 8;
        }
    }

    GSL_SUPPRESS_TYPE1 const uint8_t* p8 = reinterpret_cast<const uint8_t*>(p64);
    if (p8)
    {
        while (nRemaining)
        {
            // every lead byte, stray trail byte, or ASCII byte produces at least one code unit
            --nRemaining;
            ++nUnicodeLength;

            const uint8_t c = *p8++;
            if ((c & 0x80) == 0)
            {
                // single byte character
                continue;
            }

            bAsciiOnly = false;

            if ((c & 0x40) == 0)
            {
                // trail byte without lead - will be replaced with xFFFD
                continue;
            }

            GSL_SUPPRESS_BOUNDS4 const auto nAdditional = UTF_NUM_TRAIL_BYTES[c & 0x7F];
            if (nAdditional > nRemaining) // not enough remaining - conversion stops here
                break;

            if (nAdditional == 3)
            {
                // extra space for surrogate pair
                ++nUnicodeLength;
            }
            // 5/6 byte UTF-8 sequences (nAdditional > 3) are not supported in UTF-16
            // and will be replaced with a single xFFFD, so they need no extra space.

            // skip over the trail bytes so they are not counted again as separate characters.
            // the pointer and the remaining count must move together or the scan would stop
            // before the end of the input and under-report the required length.
            p8 += nAdditional;
            nRemaining -= nAdditional;
        }
    }

    bAsciiOnlyOut = bAsciiOnly;
    return nUnicodeLength;
}

// Converts a UTF-8 string of known length to UTF-16.
//
// sText       - UTF-8 bytes to convert (may be null, which yields an empty string).
// nTextLength - number of bytes in sText; embedded nulls are converted like any other character.
//
// Returns the UTF-16 string. Stray trail bytes, sequences with malformed trail bytes,
// the bytes 0xFE/0xFF, 5/6 byte sequences and values above U+10FFFF are each replaced with
// U+FFFD. A sequence truncated by the end of the input produces one U+FFFD and ends the
// conversion.
//
// NOTE(review): output is written as uint16_t, which assumes wchar_t is 16 bits wide - confirm
// before building for a platform where it is not.
static std::wstring Widen(const char* sText, size_t nTextLength)
{
    if (!sText)
        return {};

    bool bAsciiOnly = true;
    const auto nUnicodeLength = CalculateUnicodeLength(sText, nTextLength, bAsciiOnly);

    // resize() provides the null terminator, so only the payload has to be written
    std::wstring sResult;
    sResult.resize(nUnicodeLength);
    GSL_SUPPRESS_TYPE1 uint16_t* pOut = reinterpret_cast<uint16_t*>(sResult.data());

    GSL_SUPPRESS_TYPE1 const uint8_t* pSrc = reinterpret_cast<const uint8_t*>(sText);
    const uint8_t* pStop = pSrc + nTextLength;

    if (!pOut || !pSrc)
        return sResult;

    if (bAsciiOnly)
    {
        // fast path: every byte maps directly to a single code unit
        while (pSrc < pStop)
            *pOut++ = gsl::narrow_cast<uint16_t>(*pSrc++);
        return sResult;
    }

    while (pSrc < pStop)
    {
        const uint8_t c = *pSrc++;
        if ((c & 0x80) == 0)
        {
            *pOut++ = gsl::narrow_cast<uint16_t>(c);
            continue;
        }

        if ((c & 0xC0) == 0x80)
        {
            // trail byte
            *pOut++ = 0xFFFD;
            continue;
        }

        GSL_SUPPRESS_BOUNDS4 auto nAdditional = UTF_NUM_TRAIL_BYTES[c & 0x7F];
        if (gsl::narrow_cast<size_t>(pStop - pSrc) < nAdditional)
        {
            // not enough data
            *pOut++ = 0xFFFD;
            break;
        }

        // 0xFE/0xFF (no trail bytes) are never valid in UTF-8, and 5/6 byte sequences cannot
        // be represented in UTF-16. CalculateUnicodeLength reserves exactly one code unit for
        // these, so they must never be allowed to produce a surrogate pair.
        bool bInvalid = (nAdditional == 0 || nAdditional > 3);

        uint32_t nAccumulator = gsl::narrow_cast<uint32_t>(c);

        // 1 additional -> 0x1F, 2 -> 0x0F, 3 -> 0x07, 4 -> 0x03, 5 -> 0x01
        nAccumulator &= (1 << (6 - nAdditional)) - 1;

        // the trail bytes are always consumed, even when the sequence is invalid
        while (nAdditional)
        {
            const uint8_t c2 = *pSrc++;
            bInvalid |= ((c2 & 0xC0) != 0x80);
            nAccumulator <<= 6;
            nAccumulator |= (c2 & 0x3F);
            --nAdditional;
        }

        if (bInvalid)
        {
            *pOut++ = 0xFFFD;
        }
        else if (nAccumulator <= 0xFFFF)
        {
            // fits in a single code unit (U+FFFF itself included)
            *pOut++ = gsl::narrow_cast<uint16_t>(nAccumulator & 0xFFFF);
        }
        else if (nAccumulator < 0x110000)
        {
            // convert to surrogate pair
            nAccumulator -= 0x10000;
            *pOut++ = 0xD800 | gsl::narrow_cast<uint16_t>(nAccumulator >> 10);
            *pOut++ = 0xDC00 | gsl::narrow_cast<uint16_t>(nAccumulator & 0x03FF);
        }
        else
        {
            // beyond the range supported by UTF-16
            *pOut++ = 0xFFFD;
        }
    }

    // trim to the number of code units actually written
    GSL_SUPPRESS_TYPE1 const auto nActualSize = pOut - reinterpret_cast<uint16_t*>(sResult.data());
    sResult.resize(gsl::narrow_cast<size_t>(nActualSize));
    return sResult;
}

// Converts a UTF-8 std::string to a UTF-16 std::wstring.
_Use_decl_annotations_
std::wstring Widen(const std::string& str)
{
    // pass the length explicitly so the whole string is converted,
    // rather than stopping at the first embedded null character
    return Widen(str.c_str(), str.length());
}

std::wstring Widen(std::string&& str) noexcept
Expand All @@ -49,15 +344,7 @@ std::wstring Widen(std::string&& str) noexcept
// Converts a null-terminated UTF-8 string to a UTF-16 std::wstring.
// A null pointer yields an empty string.
_Use_decl_annotations_
std::wstring Widen(const char* str)
{
    // strlen must not be called with a null pointer
    if (!str)
        return {};

    return Widen(str, strlen(str));
}

_Use_decl_annotations_
Expand Down
1 change: 1 addition & 0 deletions tests/devkit/util/Strings_Tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ TEST_CLASS(RA_StringUtils_Tests)
Assert::AreEqual(std::wstring(L"Test"), Widen(L"Test"));
Assert::AreEqual(std::wstring(L"Test"), Widen(std::string("Test")));
Assert::AreEqual(std::wstring(L"Test"), Widen(std::wstring(L"Test")));
Assert::AreEqual(std::wstring(L"ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()-=_+"), Widen(std::string("ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()-=_+")));

// U+1F30F - EARTH GLOBE ASIA-AUSTRALIA
Assert::AreEqual(std::wstring(L"\xD83C\xDF0F"), Widen("\xF0\x9F\x8C\x8F"));
Expand Down
Loading