HaxeFoundation · jdonaldson · Feb 11, 2026
diff --git a/src/generators/genlua.ml b/src/generators/genlua.ml
@@ -2115,6 +2115,10 @@ let generate com =
          newline ctx
     );
 
+    (* UTF-8 shim: pre-populate package.loaded before @:luaRequire generates require *)
+    if not ctx.lua_vanilla then
+        print_file (find_file "lua/_lua/_hx_utf8.lua");
+
     List.iter (generate_type_forward ctx) com.types; newline ctx;
 
     (* Generate some dummy placeholders for utility libs that may be required*)

diff --git a/std/lua/_lua/_hx_utf8.lua b/std/lua/_lua/_hx_utf8.lua
@@ -0,0 +1,141 @@
+-- UTF-8 compatibility shim: pre-populate package.loaded['lua-utf8']
+-- so that @:luaRequire('lua-utf8') finds it via the normal require mechanism.
+--
+-- Priority: real lua-utf8 lib > built-in utf8 module (Lua 5.3+) > do nothing
+--
+-- Limitations when using built-in utf8 fallback:
+--   upper/lower are ASCII-only
+--   gsub/gmatch/match operate on bytes, not characters
+--   byte(s, i) returns a single codepoint; there is no range form
+--   strings that are not valid UTF-8 are treated as plain byte strings by
+--   len (lax only), byte, sub and find
+
+local hasLuaUtf8 = pcall(require, 'lua-utf8')
+if not hasLuaUtf8 then
+  local hasBuiltinUtf8, builtinUtf8 = pcall(require, 'utf8')
+  -- Build the shim only when the module really offers the utf8 functions used
+  -- below; anything else that happens to be named 'utf8' is left alone.
+  if hasBuiltinUtf8 and type(builtinUtf8) == 'table'
+      and builtinUtf8.len and builtinUtf8.offset and builtinUtf8.codepoint then
+    local compat = {}
+
+    -- len(s, i, j, lax)
+    -- i and j are byte positions handed to the built-in utf8.len (default 1, -1).
+    -- Built-in utf8.len does not support the lax parameter.
+    -- genlua.ml hardcodes: __lua_lib_luautf8_Utf8.len(s, nil, nil, true)
+    -- When lax is true and utf8.len fails (invalid UTF-8), fall back to the
+    -- number of bytes in the range i..j, which is #s for the default range.
+    -- Without lax, nil and the built-in's second result are returned.
+    compat.len = function(s, i, j, lax)
+      i = i or 1
+      j = j or -1
+      local result, err = builtinUtf8.len(s, i, j)
+      if result then
+        return result
+      elseif lax then
+        return #(string.sub(s, i, j))
+      else
+        return nil, err
+      end
+    end
+
+    -- char(...) maps directly
+    compat.char = builtinUtf8.char
+
+    -- codes(s) maps directly
+    compat.codes = builtinUtf8.codes
+
+    -- byte(s, i) - return codepoint at character position i (1-based, default 1)
+    -- Built-in utf8.codepoint takes byte positions, so convert first.
+    -- utf8.offset(s,0) has special semantics (finds char at byte pos), so reject 0.
+    -- Returns nil when i is 0 or outside the string.
+    compat.byte = function(s, i)
+      i = i or 1
+      if i == 0 then return nil end
+      -- utf8.codepoint raises on malformed input, so an invalid string is read
+      -- as raw bytes; the parentheses turn "no result" into nil.
+      if type(s) == 'string' and not builtinUtf8.len(s) then
+        return (string.byte(s, i))
+      end
+      local ok, bytePos = pcall(builtinUtf8.offset, s, i)
+      if not ok or not bytePos or bytePos > #s then return nil end
+      return builtinUtf8.codepoint(s, bytePos)
+    end
+
+    -- sub(s, i, j) - substring by character positions (1-based, inclusive)
+    -- j defaults to -1; negative positions count back from the last character.
+    -- Convert character positions to byte positions, then use string.sub.
+    compat.sub = function(s, i, j)
+      j = j or -1
+      local len = builtinUtf8.len(s)
+      -- invalid UTF-8: plain byte substring
+      if not len then return s:sub(i, j) end
+
+      -- normalize negative indices
+      if i < 0 then i = len + i + 1 end
+      if j < 0 then j = len + j + 1 end
+
+      -- clamp
+      if i < 1 then i = 1 end
+      if j > len then j = len end
+      if i > j then return "" end
+
+      local byteStart = builtinUtf8.offset(s, i)
+      -- end of character j = start of character j+1 minus 1
+      local byteEnd
+      if j >= len then
+        byteEnd = #s
+      else
+        byteEnd = builtinUtf8.offset(s, j + 1) - 1
+      end
+
+      return s:sub(byteStart, byteEnd)
+    end
+
+    -- Number of characters of the valid UTF-8 string s (len characters long)
+    -- that start at or before byte position bytePos.
+    -- utf8.len raises when its end position is past #s, which is where an
+    -- empty match at the end of the string starts, so answer that case directly.
+    local function charIndex(s, len, bytePos)
+      if bytePos > #s then return len + 1 end
+      return builtinUtf8.len(s, 1, bytePos)
+    end
+
+    -- Turn the results of string.find into character positions, passing any
+    -- captures through untouched.
+    local function toCharPositions(s, len, byteStart, byteEnd, ...)
+      if not byteStart then return nil end
+      return charIndex(s, len, byteStart), charIndex(s, len, byteEnd), ...
+    end
+
+    -- find(s, pat, init, plain) - convert init and results between char/byte positions
+    -- Matching itself is done by string.find, i.e. on bytes.
+    -- Returns start, end and any captures, or nil when there is no match.
+    compat.find = function(s, pat, init, plain)
+      local len = builtinUtf8.len(s)
+      -- invalid UTF-8: plain byte search with byte positions
+      if not len then return string.find(s, pat, init, plain) end
+
+      local byteInit = nil
+      if init then
+        -- same rules as string.find: negative counts from the end, floor at 1
+        if init < 0 then init = len + init + 1 end
+        if init < 1 then init = 1 end
+        if init > len + 1 then return nil end -- init past end of string
+        byteInit = builtinUtf8.offset(s, init)
+      end
+      return toCharPositions(s, len, string.find(s, pat, byteInit, plain))
+    end
+
+    -- ASCII-only fallbacks
+    compat.upper = string.upper
+    compat.lower = string.lower
+
+    -- Byte-level fallbacks for pattern functions
+    compat.gsub = string.gsub
+    compat.gmatch = string.gmatch
+    compat.match = string.match
+
+    package.loaded['lua-utf8'] = compat
+  end
+end
diff --git a/tests/unit/src/unit/issues/Issue9412.hx b/tests/unit/src/unit/issues/Issue9412.hx
@@ -0,0 +1,82 @@
+package unit.issues;
+
+/**
+	Exercises the basic `String` API on the Lua target with ASCII and
+	non-ASCII input. Other targets only get a placeholder test.
+**/
+class Issue9412 extends unit.Test {
+#if lua
+	/** `length` is measured in characters, not bytes. **/
+	function testLength() {
+		eq("".length, 0);
+		// ASCII only
+		eq("hello".length, 5);
+		// "héllo": one non-ASCII character, still 5 characters
+		eq("h\u00E9llo".length, 5);
+		// 3 CJK characters
+		eq("\u4F60\u597D\u554A".length, 3);
+	}
+
+	/** `charAt` addresses characters by 0-based index; out of range gives "". **/
+	function testCharAt() {
+		var word = "h\u00E9llo";
+		eq(word.charAt(0), "h");
+		eq(word.charAt(1), "\u00E9");
+		eq(word.charAt(2), "l");
+		eq(word.charAt(4), "o");
+		eq(word.charAt(10), "");
+	}
+
+	/** `charCodeAt` yields the code point of the addressed character. **/
+	function testCharCodeAt() {
+		var word = "h\u00E9llo";
+		eq(word.charCodeAt(0), 0x68); // 'h'
+		eq(word.charCodeAt(1), 0xE9); // 'é'
+		eq(word.charCodeAt(2), 0x6C); // 'l'
+	}
+
+	/** `indexOf` reports character offsets and -1 for a missing needle. **/
+	function testIndexOf() {
+		var word = "h\u00E9llo";
+		eq(word.indexOf("z"), -1);
+		eq(word.indexOf("h"), 0);
+		eq(word.indexOf("\u00E9"), 1);
+		eq(word.indexOf("llo"), 2);
+	}
+
+	/** `substring` cuts on character boundaries. **/
+	function testSubstring() {
+		var word = "h\u00E9llo";
+		eq(word.substring(0, 1), "h");
+		eq(word.substring(1, 2), "\u00E9");
+		eq(word.substring(2), "llo");
+		eq(word.substring(0, 5), "h\u00E9llo");
+	}
+
+	/** Case conversion is only checked with ASCII letters. **/
+	function testToUpperLowerCase() {
+		eq("HELLO".toLowerCase(), "hello");
+		eq("hello".toUpperCase(), "HELLO");
+	}
+
+	/** `String.fromCharCode` with an ASCII and a non-ASCII code. **/
+	function testFromCharCode() {
+		eq(String.fromCharCode(0xE9), "\u00E9");
+		eq(String.fromCharCode(0x68), "h");
+	}
+
+	/** Splitting on a non-ASCII delimiter. **/
+	function testSplit() {
+		var pieces = "a\u00E9b\u00E9c".split("\u00E9");
+		eq(pieces.length, 3);
+		eq(pieces[0], "a");
+		eq(pieces[1], "b");
+		eq(pieces[2], "c");
+	}
+#else
+	/** Placeholder for targets other than Lua; it only calls noAssert(). **/
+	function test() {
+		noAssert();
+	}
+#end
+}