AK+Everywhere: Recognise that surrogates in utf16 aren't all that common

For the slight cost of counting code points when converting between encodings, plus a tiny bit of memory, this commit adds a fast path for substring and code point operations on UTF-16 strings that contain no surrogates. These operations seem to account for a significant chunk of the time spent in many regex benchmarks.
Author: https://github.com/alimpfard Commit: eea81738cd Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/4196 Reviewed-by: https://github.com/ADKaster ✅
2025-08-04 07:09:47 +00:00 · 2025-04-02 17:56:49 +02:00 · 2025-04-02 17:56:49 +02:00 · eea81738cd · 2025-04-23 13:57:06 +00:00
commit eea81738cd
parent 86c756a589
11 changed files with 74 additions and 37 deletions
--- a/Libraries/LibJS/Runtime/Utf16String.cpp
+++ b/Libraries/LibJS/Runtime/Utf16String.cpp
@ -34,7 +34,10 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16Data string)

 NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(StringView string)
 {
-    return create(MUST(utf8_to_utf16(string)));
+    auto result = MUST(utf8_to_utf16(string));
+    auto impl = create(move(result.data));
+    impl->m_cached_view.unsafe_set_code_point_length(result.code_point_count); // Reuse the count reported by the conversion.
+    return impl;
 }

 NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
@ -42,7 +45,9 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
    Utf16Data string;
    string.ensure_capacity(view.length_in_code_units());
    string.unchecked_append(view.data(), view.length_in_code_units());
-    return create(move(string));
+    auto impl = create(move(string));
+    impl->m_cached_view.unsafe_set_code_point_length(view.length_in_code_points()); // Code points, not code units: surrogate pairs make the two differ.
+    return impl;
 }

 Utf16Data const& Utf16StringImpl::string() const
@ -52,7 +57,7 @@ Utf16Data const& Utf16StringImpl::string() const

 Utf16View Utf16StringImpl::view() const
 {
-    return Utf16View { m_string };
+    return m_cached_view; // Stored view; create() may have set its code point length.
 }

 u32 Utf16StringImpl::compute_hash() const