AK+LibJS: Do not set UTF-16 code point length to its code unit length

Author: https://github.com/trflynn89 Commit: efa9737cf7 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5214 Reviewed-by: https://github.com/gmta ✅
2025-07-24 18:02:20 +00:00 · 2025-06-25 15:52:01 -04:00 · 2025-06-25 15:52:01 -04:00 · efa9737cf7 · 2025-06-25 20:22:15 +00:00
commit efa9737cf7
parent 594194eb60
3 changed files with 25 additions and 1 deletions
--- a/AK/Utf16View.h
+++ b/AK/Utf16View.h
@@ -113,6 +113,13 @@ public:
    // Number of UTF-16 code units in this view, i.e. the size of m_code_units.
    size_t length_in_code_units() const { return m_code_units.size(); }
    // Number of code points in this view. Only declared here; the definition is not part of this excerpt.
    size_t length_in_code_points() const;
    // Reports the stored code point length without computing it. While m_length_in_code_points
    // still equals NumericLimits<size_t>::max(), the length is treated as not yet known and an
    // empty Optional is returned instead.
    Optional<size_t> length_in_code_points_if_known() const
    {
        auto const is_known = m_length_in_code_points != NumericLimits<size_t>::max();
        if (is_known)
            return m_length_in_code_points;
        return {};
    }
    // Code point iterator built from begin_ptr() and the full code unit count of the view.
    Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size() }; }
    // Matching end iterator, built from end_ptr() with a count of 0.
    Utf16CodePointIterator end() const { return { end_ptr(), 0 }; }
--- a/Libraries/LibJS/Runtime/Utf16String.cpp
+++ b/Libraries/LibJS/Runtime/Utf16String.cpp
@@ -45,8 +45,11 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
    Utf16Data string;
    string.ensure_capacity(view.length_in_code_units());
    string.unchecked_append(view.data(), view.length_in_code_units());
    auto impl = create(move(string));
-    impl->m_cached_view.unsafe_set_code_point_length(view.length_in_code_units());
+    if (auto length_in_code_points = view.length_in_code_points_if_known(); length_in_code_points.has_value())
        impl->m_cached_view.unsafe_set_code_point_length(*length_in_code_points);
    return impl;
 }
--- a/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js
+++ b/Libraries/LibJS/Tests/builtins/RegExp/RegExp.prototype.exec.js
@@ -212,3 +212,17 @@ test("string coercion", () => {
    expect(result[0]).toBe("1");
    expect(result.index).toBe(0);
 });
 test("cached UTF-16 code point length", () => {
    // Regression test: the string produced by a regex match used to carry an incorrectly cached
    // code point length, which made later code point lookups on that string return wrong results.
    // Feeding the first match back into exec() and then reading a code point from the second
    // match exercises that path.
    const emojiPattern = /\p{Emoji_Presentation}/u;

    const firstMatch = emojiPattern.exec("😀")[0];
    const secondMatch = emojiPattern.exec(firstMatch)[0];

    expect(secondMatch.codePointAt(0)).toBe(0x1f600);
 });