mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-04-24 05:25:13 +00:00
AK+Everywhere: Recognise that surrogates in utf16 aren't all that common
For the slight cost of counting code points when converting between encodings, plus a small amount of extra memory, this commit adds a fast path for UTF-16 substring and code point operations on strings that contain no surrogate pairs (i.e. where every code point is exactly one code unit). These operations seem to account for a significant chunk of the time spent in many regex benchmarks.
This commit is contained in:
parent
86c756a589
commit
eea81738cd
Notes:
github-actions[bot]
2025-04-23 13:57:06 +00:00
Author: https://github.com/alimpfard Commit: https://github.com/LadybirdBrowser/ladybird/commit/eea81738cd8 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/4196 Reviewed-by: https://github.com/ADKaster ✅
11 changed files with 74 additions and 37 deletions
|
@ -37,35 +37,40 @@ static constexpr u16 host_code_unit(u16 code_unit, Endianness endianness)
|
|||
}
|
||||
|
||||
template<OneOf<Utf8View, Utf32View> UtfViewType>
|
||||
static ErrorOr<Utf16Data> to_utf16_slow(UtfViewType const& view, Endianness endianness)
|
||||
static ErrorOr<Utf16ConversionResult> to_utf16_slow(UtfViewType const& view, Endianness endianness)
|
||||
{
|
||||
Utf16Data utf16_data;
|
||||
TRY(utf16_data.try_ensure_capacity(view.length()));
|
||||
|
||||
for (auto code_point : view)
|
||||
size_t code_point_count = 0;
|
||||
for (auto code_point : view) {
|
||||
TRY(code_point_to_utf16(utf16_data, code_point, endianness));
|
||||
code_point_count++;
|
||||
}
|
||||
|
||||
return utf16_data;
|
||||
return Utf16ConversionResult { move(utf16_data), code_point_count };
|
||||
}
|
||||
|
||||
ErrorOr<Utf16Data> utf8_to_utf16(StringView utf8_view, Endianness endianness)
|
||||
ErrorOr<Utf16ConversionResult> utf8_to_utf16(StringView utf8_view, Endianness endianness)
|
||||
{
|
||||
return utf8_to_utf16(Utf8View { utf8_view }, endianness);
|
||||
}
|
||||
|
||||
ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view, Endianness endianness)
|
||||
ErrorOr<Utf16ConversionResult> utf8_to_utf16(Utf8View const& utf8_view, Endianness endianness)
|
||||
{
|
||||
// All callers want to allow lonely surrogates, which simdutf does not permit.
|
||||
if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]]
|
||||
return to_utf16_slow(utf8_view, endianness);
|
||||
if (utf8_view.is_empty())
|
||||
return Utf16Data {};
|
||||
return Utf16ConversionResult { Utf16Data {}, 0 };
|
||||
|
||||
auto const* data = reinterpret_cast<char const*>(utf8_view.bytes());
|
||||
auto length = utf8_view.byte_length();
|
||||
|
||||
Utf16Data utf16_data;
|
||||
TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8(data, length)));
|
||||
// FIXME: simdutf _could_ be telling us about this, but it doesn't -- so we have to compute it again.
|
||||
auto code_point_length = simdutf::count_utf8(data, length);
|
||||
|
||||
[[maybe_unused]] auto result = [&]() {
|
||||
switch (endianness) {
|
||||
|
@ -80,13 +85,13 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const& utf8_view, Endianness endiannes
|
|||
}();
|
||||
ASSERT(result == utf16_data.size());
|
||||
|
||||
return utf16_data;
|
||||
return Utf16ConversionResult { utf16_data, code_point_length };
|
||||
}
|
||||
|
||||
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view, Endianness endianness)
|
||||
ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const& utf32_view, Endianness endianness)
|
||||
{
|
||||
if (utf32_view.is_empty())
|
||||
return Utf16Data {};
|
||||
return Utf16ConversionResult { Utf16Data {}, 0 };
|
||||
|
||||
auto const* data = reinterpret_cast<char32_t const*>(utf32_view.code_points());
|
||||
auto length = utf32_view.length();
|
||||
|
@ -107,7 +112,7 @@ ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const& utf32_view, Endianness endian
|
|||
}();
|
||||
ASSERT(result == utf16_data.size());
|
||||
|
||||
return utf16_data;
|
||||
return Utf16ConversionResult { utf16_data, length };
|
||||
}
|
||||
|
||||
ErrorOr<void> code_point_to_utf16(Utf16Data& string, u32 code_point, Endianness endianness)
|
||||
|
@ -207,6 +212,9 @@ u32 Utf16View::code_point_at(size_t index) const
|
|||
|
||||
size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
|
||||
{
|
||||
if (m_length_in_code_points == m_code_units.size()) // Fast path: all code points are one code unit.
|
||||
return code_unit_offset;
|
||||
|
||||
size_t code_point_offset = 0;
|
||||
|
||||
for (auto it = begin(); it != end(); ++it) {
|
||||
|
@ -222,6 +230,9 @@ size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
|
|||
|
||||
size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
|
||||
{
|
||||
if (m_length_in_code_points == m_code_units.size()) // Fast path: all code points are one code unit.
|
||||
return code_point_offset;
|
||||
|
||||
size_t code_unit_offset = 0;
|
||||
|
||||
for (auto it = begin(); it != end(); ++it) {
|
||||
|
@ -256,6 +267,9 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
|
|||
if (code_point_length == 0)
|
||||
return {};
|
||||
|
||||
if (m_length_in_code_points == m_code_units.size()) // Fast path: all code points are one code unit.
|
||||
return substring_view(code_point_offset, code_point_length);
|
||||
|
||||
auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { return it.m_ptr - begin_ptr(); };
|
||||
size_t code_point_index = 0;
|
||||
size_t code_unit_offset = 0;
|
||||
|
|
|
@ -21,9 +21,13 @@ namespace AK {
|
|||
|
||||
using Utf16Data = Vector<u16, 1>;
|
||||
|
||||
ErrorOr<Utf16Data> utf8_to_utf16(StringView, Endianness = Endianness::Host);
|
||||
ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host);
|
||||
ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
|
||||
struct Utf16ConversionResult {
|
||||
Utf16Data data;
|
||||
size_t code_point_count;
|
||||
};
|
||||
ErrorOr<Utf16ConversionResult> utf8_to_utf16(StringView, Endianness = Endianness::Host);
|
||||
ErrorOr<Utf16ConversionResult> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host);
|
||||
ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
|
||||
ErrorOr<void> code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host);
|
||||
|
||||
[[nodiscard]] bool validate_utf16_le(ReadonlyBytes);
|
||||
|
@ -77,6 +81,13 @@ public:
|
|||
{
|
||||
}
|
||||
|
||||
Utf16View(Utf16ConversionResult&&) = delete;
|
||||
explicit Utf16View(Utf16ConversionResult const& conversion_result)
|
||||
: m_code_units(conversion_result.data)
|
||||
, m_length_in_code_points(conversion_result.code_point_count)
|
||||
{
|
||||
}
|
||||
|
||||
template<size_t Size>
|
||||
Utf16View(char16_t const (&code_units)[Size])
|
||||
: m_code_units(
|
||||
|
@ -95,6 +106,8 @@ public:
|
|||
ErrorOr<ByteString> to_byte_string(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
|
||||
ErrorOr<String> to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
|
||||
|
||||
void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; }
|
||||
|
||||
bool is_null() const { return m_code_units.is_null(); }
|
||||
bool is_empty() const { return m_code_units.is_empty(); }
|
||||
size_t length_in_code_units() const { return m_code_units.size(); }
|
||||
|
|
|
@ -572,7 +572,8 @@ JS_DEFINE_NATIVE_FUNCTION(GlobalObject::escape)
|
|||
// 2. Let length be the length of string.
|
||||
// 5. Let k be 0.
|
||||
// 6. Repeat, while k < length,
|
||||
for (auto code_point : TRY_OR_THROW_OOM(vm, utf8_to_utf16(string))) {
|
||||
auto utf16_conversion = TRY_OR_THROW_OOM(vm, utf8_to_utf16(string));
|
||||
for (auto code_point : utf16_conversion.data) {
|
||||
// a. Let char be the code unit at index k within string.
|
||||
|
||||
// b. If unescapedSet contains char, then
|
||||
|
|
|
@ -97,8 +97,8 @@ ErrorOr<String, ParseRegexPatternError> parse_regex_pattern(StringView pattern,
|
|||
if (utf16_pattern_result.is_error())
|
||||
return ParseRegexPatternError { "Out of memory"_string };
|
||||
|
||||
auto utf16_pattern = utf16_pattern_result.release_value();
|
||||
Utf16View utf16_pattern_view { utf16_pattern };
|
||||
auto utf16_result = utf16_pattern_result.release_value();
|
||||
Utf16View utf16_pattern_view { utf16_result };
|
||||
StringBuilder builder;
|
||||
|
||||
// If the Unicode flag is set, append each code point to the pattern. Otherwise, append each
|
||||
|
|
|
@ -34,7 +34,10 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16Data string)
|
|||
|
||||
NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(StringView string)
|
||||
{
|
||||
return create(MUST(utf8_to_utf16(string)));
|
||||
auto result = MUST(utf8_to_utf16(string));
|
||||
auto impl = create(move(result.data));
|
||||
impl->m_cached_view.unsafe_set_code_point_length(result.code_point_count);
|
||||
return impl;
|
||||
}
|
||||
|
||||
NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
|
||||
|
@ -42,7 +45,9 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
|
|||
Utf16Data string;
|
||||
string.ensure_capacity(view.length_in_code_units());
|
||||
string.unchecked_append(view.data(), view.length_in_code_units());
|
||||
return create(move(string));
|
||||
auto impl = create(move(string));
|
||||
impl->m_cached_view.unsafe_set_code_point_length(view.length_in_code_units());
|
||||
return impl;
|
||||
}
|
||||
|
||||
Utf16Data const& Utf16StringImpl::string() const
|
||||
|
@ -52,7 +57,7 @@ Utf16Data const& Utf16StringImpl::string() const
|
|||
|
||||
Utf16View Utf16StringImpl::view() const
|
||||
{
|
||||
return Utf16View { m_string };
|
||||
return m_cached_view;
|
||||
}
|
||||
|
||||
u32 Utf16StringImpl::compute_hash() const
|
||||
|
|
|
@ -48,6 +48,7 @@ private:
|
|||
mutable bool m_has_hash { false };
|
||||
mutable u32 m_hash { 0 };
|
||||
Utf16Data m_string;
|
||||
Utf16View m_cached_view { m_string.span() };
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -113,16 +113,19 @@ public:
|
|||
RegexStringView construct_as_same(Span<u32> data, Optional<ByteString>& optional_string_storage, Utf16Data& optional_utf16_storage) const
|
||||
{
|
||||
auto view = m_view.visit(
|
||||
[&]<typename T>(T const&) {
|
||||
[&optional_string_storage, data]<typename T>(T const&) {
|
||||
StringBuilder builder;
|
||||
for (auto ch : data)
|
||||
builder.append(ch); // Note: The type conversion is intentional.
|
||||
optional_string_storage = builder.to_byte_string();
|
||||
return RegexStringView { T { *optional_string_storage } };
|
||||
},
|
||||
[&](Utf16View) {
|
||||
optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors();
|
||||
return RegexStringView { Utf16View { optional_utf16_storage } };
|
||||
[&optional_utf16_storage, data](Utf16View) {
|
||||
auto conversion_result = utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors();
|
||||
optional_utf16_storage = conversion_result.data;
|
||||
auto view = Utf16View { optional_utf16_storage };
|
||||
view.unsafe_set_code_point_length(conversion_result.code_point_count);
|
||||
return RegexStringView { view };
|
||||
});
|
||||
|
||||
view.set_unicode(unicode());
|
||||
|
|
|
@ -46,8 +46,8 @@ WebIDL::ExceptionOr<String> CharacterData::substring_data(size_t offset, size_t
|
|||
{
|
||||
// 1. Let length be node’s length.
|
||||
// FIXME: This is very inefficient!
|
||||
auto utf16_data = MUST(AK::utf8_to_utf16(m_data));
|
||||
Utf16View utf16_view { utf16_data };
|
||||
auto utf16_result = MUST(AK::utf8_to_utf16(m_data));
|
||||
Utf16View utf16_view { utf16_result };
|
||||
auto length = utf16_view.length_in_code_units();
|
||||
|
||||
// 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
|
||||
|
@ -84,12 +84,12 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
|
|||
// 6. Let delete offset be offset + data’s length.
|
||||
// 7. Starting from delete offset code units, remove count code units from node’s data.
|
||||
auto before_data = utf16_view.substring_view(0, offset);
|
||||
auto inserted_data = MUST(AK::utf8_to_utf16(data));
|
||||
auto inserted_data_result = MUST(AK::utf8_to_utf16(data));
|
||||
auto after_data = utf16_view.substring_view(offset + count);
|
||||
Utf16Data full_data;
|
||||
full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data.size() + after_data.length_in_code_units());
|
||||
full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
|
||||
full_data.append(before_data.data(), before_data.length_in_code_units());
|
||||
full_data.extend(inserted_data);
|
||||
full_data.extend(inserted_data_result.data);
|
||||
full_data.append(after_data.data(), after_data.length_in_code_units());
|
||||
Utf16View full_view { full_data };
|
||||
|
||||
|
@ -120,14 +120,14 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
|
|||
// 10. For each live range whose start node is node and start offset is greater than offset plus count, increase its start offset by data’s length and decrease it by count.
|
||||
for (auto& range : Range::live_ranges()) {
|
||||
if (range->start_container() == this && range->start_offset() > (offset + count))
|
||||
TRY(range->set_start(*range->start_container(), range->start_offset() + inserted_data.size() - count));
|
||||
TRY(range->set_start(*range->start_container(), range->start_offset() + inserted_data_result.data.size() - count));
|
||||
}
|
||||
|
||||
// 11. For each live range whose end node is node and end offset is greater than offset plus count, increase its end offset by data’s length and decrease it by count.
|
||||
for (auto& range : Range::live_ranges()) {
|
||||
if (range->end_container() == this && range->end_offset() > (offset + count)) {
|
||||
// AD-HOC: Clamp offset to the end of the data if it's too large.
|
||||
auto new_offset = min(range->end_offset() + inserted_data.size() - count, length_in_utf16_code_units());
|
||||
auto new_offset = min(range->end_offset() + inserted_data_result.data.size() - count, length_in_utf16_code_units());
|
||||
TRY(range->set_end(*range->end_container(), new_offset));
|
||||
}
|
||||
}
|
||||
|
|
|
@ -327,8 +327,8 @@ void URLSearchParams::sort()
|
|||
// 1. Sort all name-value pairs, if any, by their names. Sorting must be done by comparison of code units. The relative order between name-value pairs with equal names must be preserved.
|
||||
insertion_sort(m_list, [](auto& a, auto& b) {
|
||||
// FIXME: There should be a way to do this without converting to utf16
|
||||
auto a_utf16 = MUST(utf8_to_utf16(a.name));
|
||||
auto b_utf16 = MUST(utf8_to_utf16(b.name));
|
||||
auto a_utf16 = MUST(utf8_to_utf16(a.name)).data;
|
||||
auto b_utf16 = MUST(utf8_to_utf16(b.name)).data;
|
||||
|
||||
auto common_length = min(a_utf16.size(), b_utf16.size());
|
||||
|
||||
|
|
|
@ -54,7 +54,7 @@ ByteString SVGTextContentElement::text_contents() const
|
|||
// https://svgwg.org/svg2-draft/text.html#__svg__SVGTextContentElement__getNumberOfChars
|
||||
WebIDL::ExceptionOr<WebIDL::Long> SVGTextContentElement::get_number_of_chars() const
|
||||
{
|
||||
auto chars = TRY_OR_THROW_OOM(vm(), utf8_to_utf16(text_contents()));
|
||||
auto chars = TRY_OR_THROW_OOM(vm(), utf8_to_utf16(text_contents())).data;
|
||||
return static_cast<WebIDL::Long>(chars.size());
|
||||
}
|
||||
|
||||
|
|
|
@ -160,16 +160,16 @@ TEST_CASE(out_of_bounds)
|
|||
auto segmenter = Unicode::Segmenter::create(Unicode::SegmenterGranularity::Word);
|
||||
segmenter->set_segmented_text(Utf16View { text });
|
||||
|
||||
auto result = segmenter->previous_boundary(text.size() + 1);
|
||||
auto result = segmenter->previous_boundary(text.data.size() + 1);
|
||||
EXPECT(result.has_value());
|
||||
|
||||
result = segmenter->next_boundary(text.size() + 1);
|
||||
result = segmenter->next_boundary(text.data.size() + 1);
|
||||
EXPECT(!result.has_value());
|
||||
|
||||
result = segmenter->previous_boundary(text.size());
|
||||
result = segmenter->previous_boundary(text.data.size());
|
||||
EXPECT(result.has_value());
|
||||
|
||||
result = segmenter->next_boundary(text.size());
|
||||
result = segmenter->next_boundary(text.data.size());
|
||||
EXPECT(!result.has_value());
|
||||
|
||||
result = segmenter->next_boundary(0);
|
||||
|
|
Loading…
Add table
Reference in a new issue