AK+Everywhere: Recognise that surrogates in utf16 aren't all that common

For the slight cost of counting code points when converting between
encodings and a teeny bit of memory, this commit adds a fast path for
all-happy utf-16 substrings and code point operations.

These operations seem to account for a significant chunk of the time spent
in many regex benchmarks.
This commit is contained in:
Ali Mohammad Pur 2025-04-02 17:56:49 +02:00 committed by Andrew Kaster
parent 86c756a589
commit eea81738cd
Notes: github-actions[bot] 2025-04-23 13:57:06 +00:00
11 changed files with 74 additions and 37 deletions

View file

@ -37,35 +37,40 @@ static constexpr u16 host_code_unit(u16 code_unit, Endianness endianness)
}
// Converts `view` (a Utf8View or Utf32View) to UTF-16 one code point at a time, appending each
// code point with code_point_to_utf16() using the requested endianness.
// The newer signature returns the UTF-16 data together with the number of code points that
// were appended, so callers do not have to recount them.
template<OneOf<Utf8View, Utf32View> UtfViewType>
static ErrorOr<Utf16Data> to_utf16_slow(UtfViewType const& view, Endianness endianness)
static ErrorOr<Utf16ConversionResult> to_utf16_slow(UtfViewType const& view, Endianness endianness)
{
Utf16Data utf16_data;
// Reserve capacity up front based on the view's length; a failed reservation is propagated.
TRY(utf16_data.try_ensure_capacity(view.length()));
for (auto code_point : view)
// Counts one per iterated code point, regardless of how many code units it encodes to.
size_t code_point_count = 0;
for (auto code_point : view) {
TRY(code_point_to_utf16(utf16_data, code_point, endianness));
code_point_count++;
}
return utf16_data;
return Utf16ConversionResult { move(utf16_data), code_point_count };
}
// Convenience overload: wraps the StringView in a Utf8View and forwards to the Utf8View
// overload with the same endianness.
ErrorOr<Utf16Data> utf8_to_utf16(StringView utf8_view, Endianness endianness)
ErrorOr<Utf16ConversionResult> utf8_to_utf16(StringView utf8_view, Endianness endianness)
{
return utf8_to_utf16(Utf8View { utf8_view }, endianness);
}
ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view, Endianness endianness)
ErrorOr<Utf16ConversionResult> utf8_to_utf16(Utf8View const& utf8_view, Endianness endianness)
{
// All callers want to allow lonely surrogates, which simdutf does not permit.
if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]]
return to_utf16_slow(utf8_view, endianness);
if (utf8_view.is_empty())
return Utf16Data {};
return Utf16ConversionResult { Utf16Data {}, 0 };
auto const* data = reinterpret_cast<char const*>(utf8_view.bytes());
auto length = utf8_view.byte_length();
Utf16Data utf16_data;
TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8(data, length)));
// FIXME: simdutf _could_ be telling us about this, but it doesn't -- so we have to compute it again.
auto code_point_length = simdutf::count_utf8(data, length);
[[maybe_unused]] auto result = [&]() {
switch (endianness) {
@ -80,13 +85,13 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view, Endianness endiannes
}();
ASSERT(result == utf16_data.size());
return utf16_data;
return Utf16ConversionResult { utf16_data, code_point_length };
}
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view, Endianness endianness)
ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const& utf32_view, Endianness endianness)
{
if (utf32_view.is_empty())
return Utf16Data {};
return Utf16ConversionResult { Utf16Data {}, 0 };
auto const* data = reinterpret_cast<char32_t const*>(utf32_view.code_points());
auto length = utf32_view.length();
@ -107,7 +112,7 @@ ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view, Endianness endian
}();
ASSERT(result == utf16_data.size());
return utf16_data;
return Utf16ConversionResult { utf16_data, length };
}
ErrorOr<void> code_point_to_utf16(Utf16Data& string, u32 code_point, Endianness endianness)
@ -207,6 +212,9 @@ u32 Utf16View::code_point_at(size_t index) const
size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
{
if (m_length_in_code_points == m_code_units.size()) // Fast path: all code points are one code unit.
return code_unit_offset;
size_t code_point_offset = 0;
for (auto it = begin(); it != end(); ++it) {
@ -222,6 +230,9 @@ size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
{
if (m_length_in_code_points == m_code_units.size()) // Fast path: all code points are one code unit.
return code_point_offset;
size_t code_unit_offset = 0;
for (auto it = begin(); it != end(); ++it) {
@ -256,6 +267,9 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
if (code_point_length == 0)
return {};
if (m_length_in_code_points == m_code_units.size()) // Fast path: all code points are one code unit.
return substring_view(code_point_offset, code_point_length);
auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { return it.m_ptr - begin_ptr(); };
size_t code_point_index = 0;
size_t code_unit_offset = 0;

View file

@ -21,9 +21,13 @@ namespace AK {
using Utf16Data = Vector<u16, 1>;
ErrorOr<Utf16Data> utf8_to_utf16(StringView, Endianness = Endianness::Host);
ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host);
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
// Result of converting text to UTF-16: the converted code units plus the number of code
// points they encode, as counted during the conversion.
struct Utf16ConversionResult {
// The UTF-16 code units produced by the conversion.
Utf16Data data;
// Number of code points in `data`; equals data.size() only when every code point fits in a
// single code unit.
size_t code_point_count;
};
ErrorOr<Utf16ConversionResult> utf8_to_utf16(StringView, Endianness = Endianness::Host);
ErrorOr<Utf16ConversionResult> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host);
ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
ErrorOr<void> code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host);
[[nodiscard]] bool validate_utf16_le(ReadonlyBytes);
@ -77,6 +81,13 @@ public:
{
}
// Constructing a view from a temporary conversion result is disallowed.
// NOTE(review): presumably because the view would refer to code units owned by the temporary -- confirm.
Utf16View(Utf16ConversionResult&&) = delete;
// Builds a view over the conversion result's code units and takes the code point count
// from the result instead of leaving it to be computed later.
explicit Utf16View(Utf16ConversionResult const& conversion_result)
: m_code_units(conversion_result.data)
, m_length_in_code_points(conversion_result.code_point_count)
{
}
template<size_t Size>
Utf16View(char16_t const (&code_units)[Size])
: m_code_units(
@ -95,6 +106,8 @@ public:
ErrorOr<ByteString> to_byte_string(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
ErrorOr<String> to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
// Overwrites the stored code point count without checking it against the code units.
// "Unsafe" because callers must pass the true count: other members compare this value with
// the code unit count to decide whether each code point is a single code unit.
void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; }
bool is_null() const { return m_code_units.is_null(); }
bool is_empty() const { return m_code_units.is_empty(); }
size_t length_in_code_units() const { return m_code_units.size(); }

View file

@ -572,7 +572,8 @@ JS_DEFINE_NATIVE_FUNCTION(GlobalObject::escape)
// 2. Let length be the length of string.
// 5. Let k be 0.
// 6. Repeat, while k < length,
for (auto code_point : TRY_OR_THROW_OOM(vm, utf8_to_utf16(string))) {
auto utf16_conversion = TRY_OR_THROW_OOM(vm, utf8_to_utf16(string));
for (auto code_point : utf16_conversion.data) {
// a. Let char be the code unit at index k within string.
// b. If unescapedSet contains char, then

View file

@ -97,8 +97,8 @@ ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(StringView pattern,
if (utf16_pattern_result.is_error())
return ParseRegexPatternError { "Out of memory"_string };
auto utf16_pattern = utf16_pattern_result.release_value();
Utf16View utf16_pattern_view { utf16_pattern };
auto utf16_result = utf16_pattern_result.release_value();
Utf16View utf16_pattern_view { utf16_result };
StringBuilder builder;
// If the Unicode flag is set, append each code point to the pattern. Otherwise, append each

View file

@ -34,7 +34,10 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16Data string)
// Creates a string impl from UTF-8 text by converting it to UTF-16.
// The newer lines also store the code point count reported by the conversion on the impl's
// cached view.
NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(StringView string)
{
return create(MUST(utf8_to_utf16(string)));
auto result = MUST(utf8_to_utf16(string));
// Only `data` is moved out of `result`; `code_point_count` is still read afterwards.
auto impl = create(move(result.data));
impl->m_cached_view.unsafe_set_code_point_length(result.code_point_count);
return impl;
}
// Creates a string impl that owns a copy of the code units referenced by `view`, and records
// the view's code point count on the impl's cached view.
NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
{
Utf16Data string;
string.ensure_capacity(view.length_in_code_units());
string.unchecked_append(view.data(), view.length_in_code_units());
return create(move(string));
auto impl = create(move(string));
// The cached view must be given the source view's code point count, not its code unit
// count: the two differ whenever the text contains surrogate pairs, and recording them as
// equal would wrongly mark the string as "one code unit per code point".
impl->m_cached_view.unsafe_set_code_point_length(view.length_in_code_points());
return impl;
}
Utf16Data const& Utf16StringImpl::string() const
@ -52,7 +57,7 @@ Utf16Data const& Utf16StringImpl::string() const
// Returns a view of this string's code units.
// The newer line returns the stored m_cached_view (which carries any code point length set
// on it) instead of constructing a fresh Utf16View over m_string on every call.
Utf16View Utf16StringImpl::view() const
{
return Utf16View { m_string };
return m_cached_view;
}
u32 Utf16StringImpl::compute_hash() const

View file

@ -48,6 +48,7 @@ private:
mutable bool m_has_hash { false };
mutable u32 m_hash { 0 };
Utf16Data m_string;
Utf16View m_cached_view { m_string.span() };
};
}

View file

@ -113,16 +113,19 @@ public:
RegexStringView construct_as_same(Span<u32> data, Optional<ByteString>& optional_string_storage, Utf16Data& optional_utf16_storage) const
{
auto view = m_view.visit(
[&]<typename T>(T const&) {
[&optional_string_storage, data]<typename T>(T const&) {
StringBuilder builder;
for (auto ch : data)
builder.append(ch); // Note: The type conversion is intentional.
optional_string_storage = builder.to_byte_string();
return RegexStringView { T { *optional_string_storage } };
},
[&](Utf16View) {
optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors();
return RegexStringView { Utf16View { optional_utf16_storage } };
[&optional_utf16_storage, data](Utf16View) {
auto conversion_result = utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors();
optional_utf16_storage = conversion_result.data;
auto view = Utf16View { optional_utf16_storage };
view.unsafe_set_code_point_length(conversion_result.code_point_count);
return RegexStringView { view };
});
view.set_unicode(unicode());

View file

@ -46,8 +46,8 @@ WebIDL::ExceptionOr<String> CharacterData::substring_data(size_t offset, size_t
{
// 1. Let length be nodes length.
// FIXME: This is very inefficient!
auto utf16_data = MUST(AK::utf8_to_utf16(m_data));
Utf16View utf16_view { utf16_data };
auto utf16_result = MUST(AK::utf8_to_utf16(m_data));
Utf16View utf16_view { utf16_result };
auto length = utf16_view.length_in_code_units();
// 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
@ -84,12 +84,12 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
// 6. Let delete offset be offset + datas length.
// 7. Starting from delete offset code units, remove count code units from nodes data.
auto before_data = utf16_view.substring_view(0, offset);
auto inserted_data = MUST(AK::utf8_to_utf16(data));
auto inserted_data_result = MUST(AK::utf8_to_utf16(data));
auto after_data = utf16_view.substring_view(offset + count);
Utf16Data full_data;
full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data.size() + after_data.length_in_code_units());
full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
full_data.append(before_data.data(), before_data.length_in_code_units());
full_data.extend(inserted_data);
full_data.extend(inserted_data_result.data);
full_data.append(after_data.data(), after_data.length_in_code_units());
Utf16View full_view { full_data };
@ -120,14 +120,14 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
// 10. For each live range whose start node is node and start offset is greater than offset plus count, increase its start offset by datas length and decrease it by count.
for (auto& range : Range::live_ranges()) {
if (range->start_container() == this && range->start_offset() > (offset + count))
TRY(range->set_start(*range->start_container(), range->start_offset() + inserted_data.size() - count));
TRY(range->set_start(*range->start_container(), range->start_offset() + inserted_data_result.data.size() - count));
}
// 11. For each live range whose end node is node and end offset is greater than offset plus count, increase its end offset by datas length and decrease it by count.
for (auto& range : Range::live_ranges()) {
if (range->end_container() == this && range->end_offset() > (offset + count)) {
// AD-HOC: Clamp offset to the end of the data if it's too large.
auto new_offset = min(range->end_offset() + inserted_data.size() - count, length_in_utf16_code_units());
auto new_offset = min(range->end_offset() + inserted_data_result.data.size() - count, length_in_utf16_code_units());
TRY(range->set_end(*range->end_container(), new_offset));
}
}

View file

@ -327,8 +327,8 @@ void URLSearchParams::sort()
// 1. Sort all name-value pairs, if any, by their names. Sorting must be done by comparison of code units. The relative order between name-value pairs with equal names must be preserved.
insertion_sort(m_list, [](auto& a, auto& b) {
// FIXME: There should be a way to do this without converting to utf16
auto a_utf16 = MUST(utf8_to_utf16(a.name));
auto b_utf16 = MUST(utf8_to_utf16(b.name));
auto a_utf16 = MUST(utf8_to_utf16(a.name)).data;
auto b_utf16 = MUST(utf8_to_utf16(b.name)).data;
auto common_length = min(a_utf16.size(), b_utf16.size());

View file

@ -54,7 +54,7 @@ ByteString SVGTextContentElement::text_contents() const
// https://svgwg.org/svg2-draft/text.html#__svg__SVGTextContentElement__getNumberOfChars
// Returns the number of UTF-16 code units in text_contents(), obtained by converting the
// text to UTF-16 and taking the size of the resulting data.
WebIDL::ExceptionOr<WebIDL::Long> SVGTextContentElement::get_number_of_chars() const
{
auto chars = TRY_OR_THROW_OOM(vm(), utf8_to_utf16(text_contents()));
// Only the code units are needed here; the conversion's code point count is unused.
auto chars = TRY_OR_THROW_OOM(vm(), utf8_to_utf16(text_contents())).data;
return static_cast<WebIDL::Long>(chars.size());
}

View file

@ -160,16 +160,16 @@ TEST_CASE(out_of_bounds)
auto segmenter = Unicode::Segmenter::create(Unicode::SegmenterGranularity::Word);
segmenter->set_segmented_text(Utf16View { text });
auto result = segmenter->previous_boundary(text.size() + 1);
auto result = segmenter->previous_boundary(text.data.size() + 1);
EXPECT(result.has_value());
result = segmenter->next_boundary(text.size() + 1);
result = segmenter->next_boundary(text.data.size() + 1);
EXPECT(!result.has_value());
result = segmenter->previous_boundary(text.size());
result = segmenter->previous_boundary(text.data.size());
EXPECT(result.has_value());
result = segmenter->next_boundary(text.size());
result = segmenter->next_boundary(text.data.size());
EXPECT(!result.has_value());
result = segmenter->next_boundary(0);