AK+LibJS: Do not set UTF-16 code point length to its code unit length

This commit is contained in:
Timothy Flynn 2025-06-25 15:52:01 -04:00 committed by Jelle Raaijmakers
parent 594194eb60
commit efa9737cf7
Notes: github-actions[bot] 2025-06-25 20:22:15 +00:00
3 changed files with 25 additions and 1 deletions

View file

@ -113,6 +113,13 @@ public:
size_t length_in_code_units() const { return m_code_units.size(); } size_t length_in_code_units() const { return m_code_units.size(); }
size_t length_in_code_points() const; size_t length_in_code_points() const;
// Reports the stored code point length without computing anything. A stored value equal to
// NumericLimits<size_t>::max() is treated as "not known" and yields an empty Optional.
Optional<size_t> length_in_code_points_if_known() const
{
    if (m_length_in_code_points != NumericLimits<size_t>::max())
        return m_length_in_code_points;
    return {};
}
Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size() }; } Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size() }; }
Utf16CodePointIterator end() const { return { end_ptr(), 0 }; } Utf16CodePointIterator end() const { return { end_ptr(), 0 }; }

View file

@ -45,8 +45,11 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
Utf16Data string; Utf16Data string;
string.ensure_capacity(view.length_in_code_units()); string.ensure_capacity(view.length_in_code_units());
string.unchecked_append(view.data(), view.length_in_code_units()); string.unchecked_append(view.data(), view.length_in_code_units());
auto impl = create(move(string)); auto impl = create(move(string));
impl->m_cached_view.unsafe_set_code_point_length(view.length_in_code_units()); if (auto length_in_code_points = view.length_in_code_points_if_known(); length_in_code_points.has_value())
impl->m_cached_view.unsafe_set_code_point_length(*length_in_code_points);
return impl; return impl;
} }

View file

@ -212,3 +212,17 @@ test("string coercion", () => {
expect(result[0]).toBe("1"); expect(result[0]).toBe("1");
expect(result.index).toBe(0); expect(result.index).toBe(0);
}); });
test("cached UTF-16 code point length", () => {
    // Regression test: the string returned as a regex match used to have an incorrect cached
    // code point length, which made later code point lookups on that string return wrong results.
    // Feeding the first match back into exec() and reading a code point from the second match
    // exercises that path.
    const emojiPattern = /\p{Emoji_Presentation}/u;

    const firstMatch = emojiPattern.exec("😀")[0];
    const secondMatch = emojiPattern.exec(firstMatch)[0];

    expect(secondMatch.codePointAt(0)).toBe(0x1f600);
});