Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/generators/genlua.ml
Original file line number Diff line number Diff line change
Expand Up @@ -2115,6 +2115,10 @@ let generate com =
newline ctx
);

(* UTF-8 shim: pre-populate package.loaded before @:luaRequire generates require *)
if not ctx.lua_vanilla then
print_file (find_file "lua/_lua/_hx_utf8.lua");

List.iter (generate_type_forward ctx) com.types; newline ctx;

(* Generate some dummy placeholders for utility libs that may be required*)
Expand Down
104 changes: 104 additions & 0 deletions std/lua/_lua/_hx_utf8.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
-- UTF-8 compatibility shim: pre-populate package.loaded['lua-utf8']
-- so that @:luaRequire('lua-utf8') finds it via the normal require mechanism.
--
-- Priority: real lua-utf8 lib > built-in utf8 module (Lua 5.3+) > do nothing
--
-- Limitations when using built-in utf8 fallback:
-- upper/lower are ASCII-only
-- gsub/gmatch/match operate on bytes, not characters

local hasLuaUtf8, luaUtf8 = pcall(require, 'lua-utf8')
if not hasLuaUtf8 then
local hasBuiltinUtf8, builtinUtf8 = pcall(require, 'utf8')
if hasBuiltinUtf8 then
local compat = {}

--- len(s, i, j, lax): number of UTF-8 characters in `s`.
-- `i` and `j` are forwarded to the built-in utf8.len and default to 1 and -1.
-- The `lax` flag is NOT forwarded to the built-in; it only selects what is
-- returned when the built-in reports a failure:
--   lax truthy -> the byte length `#s`
--   otherwise  -> nil plus the second value returned by the built-in
-- (genlua.ml hardcodes the call as len(s, nil, nil, true).)
compat.len = function(s, i, j, lax)
  local count, failure = builtinUtf8.len(s, i or 1, j or -1)
  if count then
    return count
  end
  if not lax then
    return nil, failure
  end
  -- Lax mode: treat the string as raw bytes.
  return #s
end

-- char(...) maps directly: the built-in utf8.char function is exposed
-- unchanged under the lua-utf8 name.
compat.char = builtinUtf8.char

-- codes(s) maps directly: the built-in utf8.codes function is exposed
-- unchanged under the lua-utf8 name.
compat.codes = builtinUtf8.codes

--- byte(s, i): code point of the character at character position `i`.
-- `i` is 1-based and defaults to 1; negative values count from the end
-- (they are handed to utf8.offset as-is).
--
-- The built-in utf8.codepoint expects a byte position, so the character
-- index is first translated with utf8.offset.
-- utf8.offset(s, 0) has special semantics (it locates the character that
-- contains a byte position), so index 0 is rejected up front.
--
-- Returns nil when `i` is 0, when the offset lookup fails, or when the
-- position lies past the end of the string.
-- When the bytes at the resolved position are not a valid UTF-8 sequence the
-- built-in decoder raises an error; in that case the raw byte at index `i`
-- is returned instead, so malformed strings are handled as bytes rather
-- than aborting the caller.
compat.byte = function(s, i)
  i = i or 1
  if i == 0 then return nil end

  local located, bytePos = pcall(builtinUtf8.offset, s, i)
  if not located or not bytePos or bytePos > #s then return nil end

  local decoded, codepoint = pcall(builtinUtf8.codepoint, s, bytePos)
  if decoded then
    return codepoint
  end
  -- Malformed sequence: fall back to byte semantics. The parentheses force
  -- a single result (nil when `i` is outside the string).
  return (string.byte(s, i))
end

--- sub(s, i, j): substring selected by character positions (1-based, inclusive).
-- `j` defaults to -1 (the last character). Negative positions count from the
-- end. Positions are clamped to the string; an empty range yields "".
-- If the built-in utf8.len cannot measure `s`, the call degrades to a plain
-- byte-indexed s:sub(i, j).
compat.sub = function(s, i, j)
  local last = j or -1
  local total = builtinUtf8.len(s)
  if not total then
    return s:sub(i, last)
  end

  local first = i

  -- Resolve positions counted from the end.
  if first < 0 then first = total + first + 1 end
  if last < 0 then last = total + last + 1 end

  -- Keep both ends inside [1, total].
  if first < 1 then first = 1 end
  if last > total then last = total end
  if first > last then
    return ""
  end

  -- Translate the character range into a byte range. The last byte of
  -- character `last` is the byte just before character `last + 1`; when
  -- `last` is the final character that is simply the end of the string.
  local fromByte = builtinUtf8.offset(s, first)
  local toByte = #s
  if last < total then
    toByte = builtinUtf8.offset(s, last + 1) - 1
  end

  return s:sub(fromByte, toByte)
end

-- Helper for compat.find: turns the byte range returned by string.find into
-- a character range and passes any pattern captures through untouched.
local function toCharRange(s, byteStart, byteEnd, ...)
  if not byteStart then return nil end

  -- Characters that start at or before byteEnd. byteEnd never exceeds #s,
  -- so this call cannot fail the built-in's range check.
  local charEnd = builtinUtf8.len(s, 1, byteEnd)
  local charStart
  if byteStart > byteEnd then
    -- Empty match (byteStart == byteEnd + 1). byteStart may be #s + 1, which
    -- utf8.len rejects as a final position, so derive the start from the end.
    charStart = charEnd and charEnd + 1
  else
    charStart = builtinUtf8.len(s, 1, byteStart)
  end

  if not (charStart and charEnd) then
    -- utf8.len could not decode the prefix (malformed UTF-8): report the
    -- byte positions rather than pretending nothing was found.
    return byteStart, byteEnd, ...
  end
  return charStart, charEnd, ...
end

--- find(s, pat, init, plain): string.find with character-based positions.
-- `init` is a character position; it is converted to a byte position before
-- searching. A positive `init` past the end of the string yields nil; a
-- negative `init` reaching before the start searches from the beginning,
-- as string.find does.
-- Returns the character positions of the first and last matched character,
-- followed by the pattern's captures, or nil when there is no match.
-- Pattern matching itself is still performed on bytes.
compat.find = function(s, pat, init, plain)
  local byteInit = nil
  if init then
    byteInit = builtinUtf8.offset(s, init)
    if not byteInit then
      if init > 0 then return nil end -- init past end of string
      byteInit = 1
    end
  end
  return toCharRange(s, string.find(s, pat, byteInit, plain))
end

-- Functions taken over unchanged from the standard string library:
--   upper, lower        -> ASCII-only case conversion
--   gsub, gmatch, match -> pattern functions that operate on bytes
for _, name in ipairs({ "upper", "lower", "gsub", "gmatch", "match" }) do
  compat[name] = string[name]
end

package.loaded['lua-utf8'] = compat
end
end
70 changes: 70 additions & 0 deletions tests/unit/src/unit/issues/Issue9412.hx
Original file line number Diff line number Diff line change
@@ -0,0 +1,70 @@
package unit.issues;

/**
Checks that String operations count and index by character rather than by
byte on the Lua target, using strings that contain multi-byte characters.
On every other target the class contains a single no-op test.
**/
class Issue9412 extends unit.Test {
#if lua
// length is reported in characters for ASCII, accented, CJK and empty strings.
function testLength() {
// ASCII
eq("hello".length, 5);
// Multi-byte: "héllo" has 5 characters but more bytes
eq("h\u00E9llo".length, 5);
// CJK: 3 characters
eq("\u4F60\u597D\u554A".length, 3);
eq("".length, 0);
}

// charAt uses 0-based character indices; an index past the end gives "".
function testCharAt() {
var s = "h\u00E9llo";
eq(s.charAt(0), "h");
eq(s.charAt(1), "\u00E9");
eq(s.charAt(2), "l");
eq(s.charAt(4), "o");
eq(s.charAt(10), "");
}

// charCodeAt returns the code point of the character at the given index.
function testCharCodeAt() {
var s = "h\u00E9llo";
eq(s.charCodeAt(0), 0x68); // 'h'
eq(s.charCodeAt(1), 0xE9); // 'é'
eq(s.charCodeAt(2), 0x6C); // 'l'
}

// indexOf reports character positions, and -1 when the needle is absent.
function testIndexOf() {
var s = "h\u00E9llo";
eq(s.indexOf("h"), 0);
eq(s.indexOf("\u00E9"), 1);
eq(s.indexOf("llo"), 2);
eq(s.indexOf("z"), -1);
}

// substring boundaries are character positions; the end index is optional.
function testSubstring() {
var s = "h\u00E9llo";
eq(s.substring(0, 1), "h");
eq(s.substring(1, 2), "\u00E9");
eq(s.substring(0, 5), "h\u00E9llo");
eq(s.substring(2), "llo");
}

// Only ASCII input is checked here.
function testToUpperLowerCase() {
// ASCII cases always work
eq("hello".toUpperCase(), "HELLO");
eq("HELLO".toLowerCase(), "hello");
}

// fromCharCode builds a one-character string for ASCII and non-ASCII code points.
function testFromCharCode() {
eq(String.fromCharCode(0x68), "h");
eq(String.fromCharCode(0xE9), "\u00E9");
}

// split with a multi-byte delimiter yields the pieces between the delimiters.
function testSplit() {
var parts = "a\u00E9b\u00E9c".split("\u00E9");
eq(parts.length, 3);
eq(parts[0], "a");
eq(parts[1], "b");
eq(parts[2], "c");
}
#else
// Nothing to check outside the Lua target.
function test() {
noAssert();
}
#end
}
Loading