diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp index c6cd17668b7..cdf8b8a609b 100644 --- a/AK/Utf16View.cpp +++ b/AK/Utf16View.cpp @@ -37,35 +37,40 @@ static constexpr u16 host_code_unit(u16 code_unit, Endianness endianness) } template UtfViewType> -static ErrorOr to_utf16_slow(UtfViewType const& view, Endianness endianness) +static ErrorOr to_utf16_slow(UtfViewType const& view, Endianness endianness) { Utf16Data utf16_data; TRY(utf16_data.try_ensure_capacity(view.length())); - for (auto code_point : view) + size_t code_point_count = 0; + for (auto code_point : view) { TRY(code_point_to_utf16(utf16_data, code_point, endianness)); + code_point_count++; + } - return utf16_data; + return Utf16ConversionResult { move(utf16_data), code_point_count }; } -ErrorOr utf8_to_utf16(StringView utf8_view, Endianness endianness) +ErrorOr utf8_to_utf16(StringView utf8_view, Endianness endianness) { return utf8_to_utf16(Utf8View { utf8_view }, endianness); } -ErrorOr utf8_to_utf16(Utf8View const& utf8_view, Endianness endianness) +ErrorOr utf8_to_utf16(Utf8View const& utf8_view, Endianness endianness) { // All callers want to allow lonely surrogates, which simdutf does not permit. if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]] return to_utf16_slow(utf8_view, endianness); if (utf8_view.is_empty()) - return Utf16Data {}; + return Utf16ConversionResult { Utf16Data {}, 0 }; auto const* data = reinterpret_cast(utf8_view.bytes()); auto length = utf8_view.byte_length(); Utf16Data utf16_data; TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8(data, length))); + // FIXME: simdutf _could_ be telling us about this, but it doesn't -- so we have to compute it again. 
+ auto code_point_length = simdutf::count_utf8(data, length); [[maybe_unused]] auto result = [&]() { switch (endianness) { @@ -80,13 +85,13 @@ ErrorOr utf8_to_utf16(Utf8View const& utf8_view, Endianness endiannes }(); ASSERT(result == utf16_data.size()); - return utf16_data; + return Utf16ConversionResult { move(utf16_data), code_point_length }; } -ErrorOr utf32_to_utf16(Utf32View const& utf32_view, Endianness endianness) +ErrorOr utf32_to_utf16(Utf32View const& utf32_view, Endianness endianness) { if (utf32_view.is_empty()) - return Utf16Data {}; + return Utf16ConversionResult { Utf16Data {}, 0 }; auto const* data = reinterpret_cast(utf32_view.code_points()); auto length = utf32_view.length(); @@ -107,7 +112,7 @@ ErrorOr utf32_to_utf16(Utf32View const& utf32_view, Endianness endian }(); ASSERT(result == utf16_data.size()); - return utf16_data; + return Utf16ConversionResult { move(utf16_data), length }; } ErrorOr code_point_to_utf16(Utf16Data& string, u32 code_point, Endianness endianness) @@ -207,6 +212,9 @@ u32 Utf16View::code_point_at(size_t index) const size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const { + if (m_length_in_code_points == m_code_units.size()) // Fast path: all code points are one code unit. + return code_unit_offset; + size_t code_point_offset = 0; for (auto it = begin(); it != end(); ++it) { @@ -222,6 +230,9 @@ size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const { + if (m_length_in_code_points == m_code_units.size()) // Fast path: all code points are one code unit. + return code_point_offset; + size_t code_unit_offset = 0; for (auto it = begin(); it != end(); ++it) { @@ -256,6 +267,9 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod if (code_point_length == 0) return {}; + if (m_length_in_code_points == m_code_units.size()) // Fast path: all code points are one code unit. 
+ return substring_view(code_point_offset, code_point_length); + auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { return it.m_ptr - begin_ptr(); }; size_t code_point_index = 0; size_t code_unit_offset = 0; diff --git a/AK/Utf16View.h b/AK/Utf16View.h index 2c69f89b6e7..9be66ced810 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -21,9 +21,13 @@ namespace AK { using Utf16Data = Vector; -ErrorOr utf8_to_utf16(StringView, Endianness = Endianness::Host); -ErrorOr utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host); -ErrorOr utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host); +struct Utf16ConversionResult { + Utf16Data data; + size_t code_point_count; +}; +ErrorOr utf8_to_utf16(StringView, Endianness = Endianness::Host); +ErrorOr utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host); +ErrorOr utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host); ErrorOr code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host); [[nodiscard]] bool validate_utf16_le(ReadonlyBytes); @@ -77,6 +81,13 @@ public: { } + Utf16View(Utf16ConversionResult&&) = delete; + explicit Utf16View(Utf16ConversionResult const& conversion_result) + : m_code_units(conversion_result.data) + , m_length_in_code_points(conversion_result.code_point_count) + { + } + template Utf16View(char16_t const (&code_units)[Size]) : m_code_units( @@ -95,6 +106,8 @@ public: ErrorOr to_byte_string(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; ErrorOr to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; + void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; } + bool is_null() const { return m_code_units.is_null(); } bool is_empty() const { return m_code_units.is_empty(); } size_t length_in_code_units() const { return m_code_units.size(); } diff --git a/Libraries/LibJS/Runtime/GlobalObject.cpp b/Libraries/LibJS/Runtime/GlobalObject.cpp index c4bc2b80d18..1671fcfb83b 100644 --- 
a/Libraries/LibJS/Runtime/GlobalObject.cpp +++ b/Libraries/LibJS/Runtime/GlobalObject.cpp @@ -572,7 +572,8 @@ JS_DEFINE_NATIVE_FUNCTION(GlobalObject::escape) // 2. Let length be the length of string. // 5. Let k be 0. // 6. Repeat, while k < length, - for (auto code_point : TRY_OR_THROW_OOM(vm, utf8_to_utf16(string))) { + auto utf16_conversion = TRY_OR_THROW_OOM(vm, utf8_to_utf16(string)); + for (auto code_point : utf16_conversion.data) { // a. Let char be the code unit at index k within string. // b. If unescapedSet contains char, then diff --git a/Libraries/LibJS/Runtime/RegExpObject.cpp b/Libraries/LibJS/Runtime/RegExpObject.cpp index 59f331f1965..817095a4468 100644 --- a/Libraries/LibJS/Runtime/RegExpObject.cpp +++ b/Libraries/LibJS/Runtime/RegExpObject.cpp @@ -97,8 +97,8 @@ ErrorOr parse_regex_pattern(StringView pattern, if (utf16_pattern_result.is_error()) return ParseRegexPatternError { "Out of memory"_string }; - auto utf16_pattern = utf16_pattern_result.release_value(); - Utf16View utf16_pattern_view { utf16_pattern }; + auto utf16_result = utf16_pattern_result.release_value(); + Utf16View utf16_pattern_view { utf16_result }; StringBuilder builder; // If the Unicode flag is set, append each code point to the pattern. 
Otherwise, append each diff --git a/Libraries/LibJS/Runtime/Utf16String.cpp b/Libraries/LibJS/Runtime/Utf16String.cpp index 27a24f175cd..5b7acc96de6 100644 --- a/Libraries/LibJS/Runtime/Utf16String.cpp +++ b/Libraries/LibJS/Runtime/Utf16String.cpp @@ -34,7 +34,10 @@ NonnullRefPtr Utf16StringImpl::create(Utf16Data string) NonnullRefPtr Utf16StringImpl::create(StringView string) { - return create(MUST(utf8_to_utf16(string))); + auto result = MUST(utf8_to_utf16(string)); + auto impl = create(move(result.data)); + impl->m_cached_view.unsafe_set_code_point_length(result.code_point_count); + return impl; } NonnullRefPtr Utf16StringImpl::create(Utf16View const& view) @@ -42,7 +45,9 @@ NonnullRefPtr Utf16StringImpl::create(Utf16View const& view) Utf16Data string; string.ensure_capacity(view.length_in_code_units()); string.unchecked_append(view.data(), view.length_in_code_units()); - return create(move(string)); + auto impl = create(move(string)); + impl->m_cached_view.unsafe_set_code_point_length(view.length_in_code_points()); + return impl; } Utf16Data const& Utf16StringImpl::string() const @@ -52,7 +57,7 @@ Utf16Data const& Utf16StringImpl::string() const Utf16View Utf16StringImpl::view() const { - return Utf16View { m_string }; + return m_cached_view; } u32 Utf16StringImpl::compute_hash() const diff --git a/Libraries/LibJS/Runtime/Utf16String.h b/Libraries/LibJS/Runtime/Utf16String.h index 0042a4dd81c..5892daa64a7 100644 --- a/Libraries/LibJS/Runtime/Utf16String.h +++ b/Libraries/LibJS/Runtime/Utf16String.h @@ -48,6 +48,7 @@ private: mutable bool m_has_hash { false }; mutable u32 m_hash { 0 }; Utf16Data m_string; + Utf16View m_cached_view { m_string.span() }; }; } diff --git a/Libraries/LibRegex/RegexMatch.h b/Libraries/LibRegex/RegexMatch.h index 6da85b57f4d..3d21bf305cf 100644 --- a/Libraries/LibRegex/RegexMatch.h +++ b/Libraries/LibRegex/RegexMatch.h @@ -113,16 +113,19 @@ public: RegexStringView construct_as_same(Span data, Optional& optional_string_storage, 
Utf16Data& optional_utf16_storage) const { auto view = m_view.visit( - [&](T const&) { + [&optional_string_storage, data](T const&) { StringBuilder builder; for (auto ch : data) builder.append(ch); // Note: The type conversion is intentional. optional_string_storage = builder.to_byte_string(); return RegexStringView { T { *optional_string_storage } }; }, - [&](Utf16View) { - optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors(); - return RegexStringView { Utf16View { optional_utf16_storage } }; + [&optional_utf16_storage, data](Utf16View) { + auto conversion_result = utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors(); + optional_utf16_storage = move(conversion_result.data); + auto view = Utf16View { optional_utf16_storage }; + view.unsafe_set_code_point_length(conversion_result.code_point_count); + return RegexStringView { view }; }); view.set_unicode(unicode()); diff --git a/Libraries/LibWeb/DOM/CharacterData.cpp b/Libraries/LibWeb/DOM/CharacterData.cpp index 608180609d7..d8a89a149ef 100644 --- a/Libraries/LibWeb/DOM/CharacterData.cpp +++ b/Libraries/LibWeb/DOM/CharacterData.cpp @@ -46,8 +46,8 @@ WebIDL::ExceptionOr CharacterData::substring_data(size_t offset, size_t { // 1. Let length be node’s length. // FIXME: This is very inefficient! - auto utf16_data = MUST(AK::utf8_to_utf16(m_data)); - Utf16View utf16_view { utf16_data }; + auto utf16_result = MUST(AK::utf8_to_utf16(m_data)); + Utf16View utf16_view { utf16_result }; auto length = utf16_view.length_in_code_units(); // 2. If offset is greater than length, then throw an "IndexSizeError" DOMException. @@ -84,12 +84,12 @@ WebIDL::ExceptionOr CharacterData::replace_data(size_t offset, size_t coun // 6. Let delete offset be offset + data’s length. // 7. Starting from delete offset code units, remove count code units from node’s data. 
auto before_data = utf16_view.substring_view(0, offset); - auto inserted_data = MUST(AK::utf8_to_utf16(data)); + auto inserted_data_result = MUST(AK::utf8_to_utf16(data)); auto after_data = utf16_view.substring_view(offset + count); Utf16Data full_data; - full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data.size() + after_data.length_in_code_units()); + full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units()); full_data.append(before_data.data(), before_data.length_in_code_units()); - full_data.extend(inserted_data); + full_data.extend(inserted_data_result.data); full_data.append(after_data.data(), after_data.length_in_code_units()); Utf16View full_view { full_data }; @@ -120,14 +120,14 @@ WebIDL::ExceptionOr CharacterData::replace_data(size_t offset, size_t coun // 10. For each live range whose start node is node and start offset is greater than offset plus count, increase its start offset by data’s length and decrease it by count. for (auto& range : Range::live_ranges()) { if (range->start_container() == this && range->start_offset() > (offset + count)) - TRY(range->set_start(*range->start_container(), range->start_offset() + inserted_data.size() - count)); + TRY(range->set_start(*range->start_container(), range->start_offset() + inserted_data_result.data.size() - count)); } // 11. For each live range whose end node is node and end offset is greater than offset plus count, increase its end offset by data’s length and decrease it by count. for (auto& range : Range::live_ranges()) { if (range->end_container() == this && range->end_offset() > (offset + count)) { // AD-HOC: Clamp offset to the end of the data if it's too large. 
- auto new_offset = min(range->end_offset() + inserted_data.size() - count, length_in_utf16_code_units()); + auto new_offset = min(range->end_offset() + inserted_data_result.data.size() - count, length_in_utf16_code_units()); TRY(range->set_end(*range->end_container(), new_offset)); } } diff --git a/Libraries/LibWeb/DOMURL/URLSearchParams.cpp b/Libraries/LibWeb/DOMURL/URLSearchParams.cpp index 2589fe9699e..76f2fb6aca5 100644 --- a/Libraries/LibWeb/DOMURL/URLSearchParams.cpp +++ b/Libraries/LibWeb/DOMURL/URLSearchParams.cpp @@ -327,8 +327,8 @@ void URLSearchParams::sort() // 1. Sort all name-value pairs, if any, by their names. Sorting must be done by comparison of code units. The relative order between name-value pairs with equal names must be preserved. insertion_sort(m_list, [](auto& a, auto& b) { // FIXME: There should be a way to do this without converting to utf16 - auto a_utf16 = MUST(utf8_to_utf16(a.name)); - auto b_utf16 = MUST(utf8_to_utf16(b.name)); + auto a_utf16 = MUST(utf8_to_utf16(a.name)).data; + auto b_utf16 = MUST(utf8_to_utf16(b.name)).data; auto common_length = min(a_utf16.size(), b_utf16.size()); diff --git a/Libraries/LibWeb/SVG/SVGTextContentElement.cpp b/Libraries/LibWeb/SVG/SVGTextContentElement.cpp index cacaa107745..32343bd6f34 100644 --- a/Libraries/LibWeb/SVG/SVGTextContentElement.cpp +++ b/Libraries/LibWeb/SVG/SVGTextContentElement.cpp @@ -54,7 +54,7 @@ ByteString SVGTextContentElement::text_contents() const // https://svgwg.org/svg2-draft/text.html#__svg__SVGTextContentElement__getNumberOfChars WebIDL::ExceptionOr SVGTextContentElement::get_number_of_chars() const { - auto chars = TRY_OR_THROW_OOM(vm(), utf8_to_utf16(text_contents())); + auto chars = TRY_OR_THROW_OOM(vm(), utf8_to_utf16(text_contents())).data; return static_cast(chars.size()); } diff --git a/Tests/LibUnicode/TestSegmenter.cpp b/Tests/LibUnicode/TestSegmenter.cpp index 13368ab8451..ccb17bd26ad 100644 --- a/Tests/LibUnicode/TestSegmenter.cpp +++ 
b/Tests/LibUnicode/TestSegmenter.cpp @@ -160,16 +160,16 @@ TEST_CASE(out_of_bounds) auto segmenter = Unicode::Segmenter::create(Unicode::SegmenterGranularity::Word); segmenter->set_segmented_text(Utf16View { text }); - auto result = segmenter->previous_boundary(text.size() + 1); + auto result = segmenter->previous_boundary(text.data.size() + 1); EXPECT(result.has_value()); - result = segmenter->next_boundary(text.size() + 1); + result = segmenter->next_boundary(text.data.size() + 1); EXPECT(!result.has_value()); - result = segmenter->previous_boundary(text.size()); + result = segmenter->previous_boundary(text.data.size()); EXPECT(result.has_value()); - result = segmenter->next_boundary(text.size()); + result = segmenter->next_boundary(text.data.size()); EXPECT(!result.has_value()); result = segmenter->next_boundary(0);