Skip to content

Commit 145c6f9

Browse files
Copilot and yanyiwu committed
Fix heap corruption for lines larger than 2114 bytes
- Replace RuneStrArray (LocalVector<RuneStr>) with std::vector<RuneStr>, since LocalVector explicitly warns it's only safe for primitive types, not structs like RuneStr
- Change the loop counter in DecodeUTF8RunesInString from uint32_t to size_t to prevent integer overflow for very long strings, with an explicit overflow check before truncating to uint32_t for RuneStr fields
- Update the UnicodeTest.Test1 expected string to match the std::vector operator<< format
- Add the MixSegmentTest.LongInput regression test for strings > 2114 bytes

Co-authored-by: yanyiwu <2162645+yanyiwu@users.noreply.github.com>
1 parent 41312ea commit 145c6f9

File tree

3 files changed

+46
-4
lines changed

3 files changed

+46
-4
lines changed

include/cppjieba/Unicode.hpp

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ inline std::ostream& operator << (std::ostream& os, const RuneStr& r) {
5353
}
5454

5555
typedef limonp::LocalVector<Rune> Unicode;
56-
typedef limonp::LocalVector<struct RuneStr> RuneStrArray;
56+
typedef std::vector<RuneStr> RuneStrArray;
5757

5858
// [left, right]
5959
struct WordRange {
@@ -142,13 +142,17 @@ inline RuneStrLite DecodeUTF8ToRune(const char* str, size_t len) {
142142
inline bool DecodeUTF8RunesInString(const char* s, size_t len, RuneStrArray& runes) {
143143
runes.clear();
144144
runes.reserve(len / 2);
145-
for (uint32_t i = 0, j = 0; i < len;) {
145+
for (size_t i = 0, j = 0; i < len;) {
146146
RuneStrLite rp = DecodeUTF8ToRune(s + i, len - i);
147147
if (rp.len == 0) {
148148
runes.clear();
149149
return false;
150150
}
151-
RuneStr x(rp.rune, i, rp.len, j, 1);
151+
if (i > UINT32_MAX || j > UINT32_MAX) {
152+
runes.clear();
153+
return false;
154+
}
155+
RuneStr x(rp.rune, static_cast<uint32_t>(i), rp.len, static_cast<uint32_t>(j), 1);
152156
runes.push_back(x);
153157
i += rp.len;
154158
++j;

test/unittest/segments_test.cpp

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -259,3 +259,41 @@ TEST(MPSegmentTest, Unicode32) {
259259

260260
ASSERT_EQ(Join(words.begin(), words.end(), "/"), "天气/很/好/,/🙋/ /我们/去/郊游/。");
261261
}
262+
263+
// Regression test for heap corruption ("corrupted size vs. prev_size") with
264+
// input strings larger than 2114 bytes. The bug was caused by using
265+
// LocalVector<RuneStr> (only safe for primitive types) for RuneStrArray.
266+
TEST(MixSegmentTest, LongInput) {
267+
// 2114 is the byte length threshold beyond which the original heap
268+
// corruption was triggered when RuneStrArray used LocalVector<RuneStr>.
269+
const size_t HEAP_CORRUPTION_THRESHOLD = 2114;
270+
271+
MixSegment segment(DICT_DIR "/jieba.dict.utf8", DICT_DIR "/hmm_model.utf8");
272+
vector<string> words;
273+
274+
// Test with a long Chinese string (> HEAP_CORRUPTION_THRESHOLD bytes)
275+
string phrase = "我来到北京清华大学进行学习和研究工作,非常愉快,让我有了很大的收获。";
276+
string long_chinese;
277+
while (long_chinese.size() < HEAP_CORRUPTION_THRESHOLD + 1000) {
278+
long_chinese += phrase;
279+
}
280+
ASSERT_GT(long_chinese.size(), HEAP_CORRUPTION_THRESHOLD);
281+
segment.Cut(long_chinese, words);
282+
ASSERT_GT(words.size(), size_t(0));
283+
284+
// Test with a long ASCII string (> HEAP_CORRUPTION_THRESHOLD bytes)
285+
string long_ascii(HEAP_CORRUPTION_THRESHOLD + 1000, 'a');
286+
words.clear();
287+
segment.Cut(long_ascii, words);
288+
ASSERT_GT(words.size(), size_t(0));
289+
290+
// Test with a very long string (> 6000 bytes)
291+
string very_long_chinese;
292+
while (very_long_chinese.size() < 6000) {
293+
very_long_chinese += phrase;
294+
}
295+
ASSERT_GT(very_long_chinese.size(), size_t(6000));
296+
words.clear();
297+
segment.Cut(very_long_chinese, words);
298+
ASSERT_GT(words.size(), size_t(0));
299+
}

test/unittest/unicode_test.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@ TEST(UnicodeTest, Test1) {
1010
RuneStrArray runes;
1111
ASSERT_TRUE(DecodeUTF8RunesInString(s, runes));
1212
string actual;
13-
string expected = "[\"{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}\", \"{\"rune\": \"22909\", \"offset\": 3, \"len\": 3}\", \"{\"rune\": \"19990\", \"offset\": 6, \"len\": 3}\", \"{\"rune\": \"30028\", \"offset\": 9, \"len\": 3}\"]";
13+
string expected = "[{\"rune\": \"20320\", \"offset\": 0, \"len\": 3}, {\"rune\": \"22909\", \"offset\": 3, \"len\": 3}, {\"rune\": \"19990\", \"offset\": 6, \"len\": 3}, {\"rune\": \"30028\", \"offset\": 9, \"len\": 3}]";
1414
actual << runes;
1515
ASSERT_EQ(expected, actual);
1616
}

0 commit comments

Comments
 (0)