AK+Everywhere: Recognise that surrogates in utf16 aren't all that common

For the slight cost of counting code points when converting between
encodings and a teeny bit of memory, this commit adds a fast path for
substring and code point operations on UTF-16 strings that contain no
surrogates.

These operations seem to account for a significant chunk of the time
spent in many regex benchmarks.
This commit is contained in:
Ali Mohammad Pur 2025-04-02 17:56:49 +02:00 committed by Andrew Kaster
commit eea81738cd
Notes: github-actions[bot] 2025-04-23 13:57:06 +00:00
11 changed files with 74 additions and 37 deletions

View file

@ -34,7 +34,10 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16Data string)
// Builds a Utf16StringImpl from a StringView by converting it with
// utf8_to_utf16(). The conversion result carries both the UTF-16 data and the
// number of code points it counted, so that count is stored on the impl's
// cached view right away.
NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(StringView string)
{
    auto result = MUST(utf8_to_utf16(string));
    auto impl = create(move(result.data));
    // The count comes from the conversion itself, so it describes exactly the
    // data that was just moved into the impl.
    impl->m_cached_view.unsafe_set_code_point_length(result.code_point_count);
    return impl;
}
NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
@ -42,7 +45,9 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
// Copy the view's code units into a new Utf16Data that the impl will own.
Utf16Data string;
string.ensure_capacity(view.length_in_code_units());
string.unchecked_append(view.data(), view.length_in_code_units());
auto impl = create(move(string));
// Seed the cached view with the number of code points, not code units: a
// surrogate pair occupies two code units but encodes a single code point, so
// the two lengths differ for any view that contains one.
impl->m_cached_view.unsafe_set_code_point_length(view.length_in_code_points());
return impl;
}
Utf16Data const& Utf16StringImpl::string() const
@ -52,7 +57,7 @@ Utf16Data const& Utf16StringImpl::string() const
// Returns the impl's cached view instead of constructing a new Utf16View over
// m_string, so a code point length previously stored on m_cached_view is
// carried along with the returned view.
Utf16View Utf16StringImpl::view() const
{
    return m_cached_view;
}
u32 Utf16StringImpl::compute_hash() const