From 9582895759675b612bbd9fbadf6ce205db3b592d Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 9 Jul 2025 14:13:38 -0400 Subject: [PATCH] AK+LibJS+LibWeb+LibRegex: Replace AK::Utf16Data with AK::Utf16String --- AK/String.cpp | 9 +-- AK/String.h | 1 - AK/Utf16View.cpp | 65 ------------------- AK/Utf16View.h | 39 ----------- Libraries/LibCore/ProcessWindows.cpp | 2 +- Libraries/LibJS/Runtime/GlobalObject.cpp | 16 ++--- Libraries/LibJS/Runtime/RegExpObject.cpp | 13 ++-- Libraries/LibRegex/RegexByteCode.cpp | 2 +- Libraries/LibRegex/RegexMatch.h | 12 ++-- Libraries/LibWeb/DOM/CharacterData.cpp | 32 +++++---- Libraries/LibWeb/DOM/Document.cpp | 13 ++-- .../LibWeb/Editing/Internal/Algorithms.cpp | 26 ++++---- Libraries/LibWeb/FileAPI/FileReader.cpp | 7 +- Libraries/LibWeb/Infra/Strings.cpp | 14 ++-- Libraries/LibWeb/Layout/Viewport.cpp | 14 ++-- Libraries/LibWeb/Layout/Viewport.h | 3 +- .../LibWeb/Painting/PaintableFragment.cpp | 5 +- Libraries/LibWeb/Painting/PaintableFragment.h | 3 +- .../LibWeb/SVG/SVGTextContentElement.cpp | 4 +- Tests/AK/TestUtf16View.cpp | 26 ++++---- Tests/LibRegex/TestRegex.cpp | 4 +- Tests/LibUnicode/TestSegmenter.cpp | 13 ++-- 22 files changed, 101 insertions(+), 222 deletions(-) diff --git a/AK/String.cpp b/AK/String.cpp index 00162a8aa37..8e1a744ac4c 100644 --- a/AK/String.cpp +++ b/AK/String.cpp @@ -67,11 +67,6 @@ ErrorOr String::from_utf8(StringView view) return result; } -ErrorOr String::from_utf16(Utf16View const& utf16) -{ - return utf16.to_utf8(); -} - ErrorOr String::from_utf16_le_with_replacement_character(ReadonlyBytes bytes) { if (bytes.is_empty()) @@ -80,7 +75,7 @@ ErrorOr String::from_utf16_le_with_replacement_character(ReadonlyBytes b auto const* utf16_data = reinterpret_cast(bytes.data()); auto utf16_length = bytes.size() / 2; - Utf16Data well_formed_utf16; + Vector well_formed_utf16; if (!validate_utf16_le(bytes)) { well_formed_utf16.resize(bytes.size()); @@ -109,7 +104,7 @@ ErrorOr String::from_utf16_be_with_replacement_character(ReadonlyBytes b auto const* utf16_data = reinterpret_cast(bytes.data()); auto utf16_length = bytes.size() / 2; - Utf16Data well_formed_utf16; + Vector well_formed_utf16; if (!validate_utf16_le(bytes)) { well_formed_utf16.resize(bytes.size()); diff --git a/AK/String.h b/AK/String.h index f30fbb6e405..639fa3160e3 100644 --- a/AK/String.h +++ b/AK/String.h @@ -69,7 +69,6 @@ public: [[nodiscard]] static String from_string_builder_without_validation(Badge, StringBuilder&); // Creates a new String from a sequence of UTF-16 encoded code points. - static ErrorOr from_utf16(Utf16View const&); static ErrorOr from_utf16_le_with_replacement_character(ReadonlyBytes); static ErrorOr from_utf16_be_with_replacement_character(ReadonlyBytes); diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp index ceb6e9c2cd4..995eee1754b 100644 --- a/AK/Utf16View.cpp +++ b/AK/Utf16View.cpp @@ -10,77 +10,12 @@ #include #include #include -#include #include #include namespace AK { -template UtfViewType> -static ErrorOr to_utf16_slow(UtfViewType const& view) -{ - Utf16Data utf16_data; - TRY(utf16_data.try_ensure_capacity(view.length())); - - size_t code_point_count = 0; - for (auto code_point : view) { - TRY(UnicodeUtils::try_code_point_to_utf16(code_point, [&](auto code_unit) -> ErrorOr { - TRY(utf16_data.try_append(code_unit)); - return {}; - })); - - code_point_count++; - } - - return Utf16ConversionResult { move(utf16_data), code_point_count }; -} - -ErrorOr utf8_to_utf16(StringView utf8_view) -{ - return utf8_to_utf16(Utf8View { utf8_view }); -} - -ErrorOr utf8_to_utf16(Utf8View const& utf8_view) -{ - if (utf8_view.is_empty()) - return Utf16ConversionResult { Utf16Data {}, 0 }; - - // All callers want to allow lonely surrogates, which simdutf does not permit. - if (!utf8_view.validate(AllowLonelySurrogates::No)) [[unlikely]] - return to_utf16_slow(utf8_view); - - auto const* data = reinterpret_cast(utf8_view.bytes()); - auto length = utf8_view.byte_length(); - - Utf16Data utf16_data; - TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf8(data, length))); - // FIXME: simdutf _could_ be telling us about this, but it doesn't -- so we have to compute it again. - auto code_point_length = simdutf::count_utf8(data, length); - - [[maybe_unused]] auto result = simdutf::convert_utf8_to_utf16(data, length, reinterpret_cast(utf16_data.data())); - ASSERT(result == utf16_data.size()); - - return Utf16ConversionResult { utf16_data, code_point_length }; -} - -ErrorOr utf32_to_utf16(Utf32View const& utf32_view) -{ - if (utf32_view.is_empty()) - return Utf16ConversionResult { Utf16Data {}, 0 }; - - auto const* data = reinterpret_cast(utf32_view.code_points()); - auto length = utf32_view.length(); - - Utf16Data utf16_data; - TRY(utf16_data.try_resize(simdutf::utf16_length_from_utf32(data, length))); - - [[maybe_unused]] auto result = simdutf::convert_utf32_to_utf16(data, length, reinterpret_cast(utf16_data.data())); - ASSERT(result == utf16_data.size()); - - return Utf16ConversionResult { utf16_data, length }; -} - bool validate_utf16_le(ReadonlyBytes bytes) { return simdutf::validate_utf16le(reinterpret_cast(bytes.data()), bytes.size() / 2); diff --git a/AK/Utf16View.h b/AK/Utf16View.h index 49996a5bf5f..eec2d62eba1 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -23,16 +23,6 @@ namespace AK { -using Utf16Data = Vector; - -struct Utf16ConversionResult { - Utf16Data data; - size_t code_point_count; -}; -ErrorOr utf8_to_utf16(StringView); -ErrorOr utf8_to_utf16(Utf8View const&); -ErrorOr utf32_to_utf16(Utf32View const&); - [[nodiscard]] bool validate_utf16_le(ReadonlyBytes); [[nodiscard]] bool validate_utf16_be(ReadonlyBytes); @@ -156,13 +146,6 @@ public: m_length_in_code_units |= 1uz << Detail::UTF16_FLAG; } - constexpr Utf16View(Utf16Data const& string) - : m_string { .utf16 = string.data() } - , m_length_in_code_units(string.size()) - { - m_length_in_code_units |= 1uz << Detail::UTF16_FLAG; - } - consteval Utf16View(StringView string) : m_string { .ascii = string.characters_without_null_termination() } , m_length_in_code_units(string.length()) @@ -170,15 +153,6 @@ public: VERIFY(all_of(string, AK::is_ascii)); } - Utf16View(Utf16ConversionResult&&) = delete; - explicit Utf16View(Utf16ConversionResult const& conversion_result) - : m_string { .utf16 = conversion_result.data.data() } - , m_length_in_code_units(conversion_result.data.size()) - , m_length_in_code_points(conversion_result.code_point_count) - { - m_length_in_code_units |= 1uz << Detail::UTF16_FLAG; - } - ErrorOr to_utf8(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const; ErrorOr to_byte_string(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const; @@ -314,18 +288,6 @@ public: return m_length_in_code_points; } - constexpr Optional length_in_code_points_if_known() const - { - if (has_ascii_storage()) - return m_length_in_code_units; - - if (m_length_in_code_points == NumericLimits::max()) - return {}; - return m_length_in_code_points; - } - - constexpr void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; } - [[nodiscard]] constexpr char16_t code_unit_at(size_t index) const { VERIFY(index < length_in_code_units()); @@ -591,6 +553,5 @@ inline constexpr bool IsHashCompatible = true; } #if USING_AK_GLOBALLY -using AK::Utf16Data; using AK::Utf16View; #endif diff --git a/Libraries/LibCore/ProcessWindows.cpp b/Libraries/LibCore/ProcessWindows.cpp index d66b7989da4..e2070e64658 100644 --- a/Libraries/LibCore/ProcessWindows.cpp +++ b/Libraries/LibCore/ProcessWindows.cpp @@ -111,7 +111,7 @@ ErrorOr Process::get_name() if (!length) return Error::from_windows_error(); - return String::from_utf16(Utf16View { reinterpret_cast(path), length }); + return MUST(Utf16View { reinterpret_cast(path), length }.to_utf8()); } ErrorOr Process::set_name(StringView, SetThreadName) diff --git a/Libraries/LibJS/Runtime/GlobalObject.cpp b/Libraries/LibJS/Runtime/GlobalObject.cpp index 6cfb243e82b..090a9a66d34 100644 --- a/Libraries/LibJS/Runtime/GlobalObject.cpp +++ b/Libraries/LibJS/Runtime/GlobalObject.cpp @@ -559,7 +559,7 @@ JS_DEFINE_NATIVE_FUNCTION(GlobalObject::encode_uri_component) JS_DEFINE_NATIVE_FUNCTION(GlobalObject::escape) { // 1. Set string to ? ToString(string). - auto string = TRY(vm.argument(0).to_byte_string(vm)); + auto string = TRY(vm.argument(0).to_utf16_string(vm)); // 3. Let R be the empty String. StringBuilder escaped; @@ -570,29 +570,29 @@ JS_DEFINE_NATIVE_FUNCTION(GlobalObject::escape) // 2. Let length be the length of string. // 5. Let k be 0. // 6. Repeat, while k < length, - auto utf16_conversion = TRY_OR_THROW_OOM(vm, utf8_to_utf16(string)); - for (auto code_point : utf16_conversion.data) { + for (size_t k = 0; k < string.length_in_code_units(); ++k) { // a. Let char be the code unit at index k within string. + auto code_unit = string.code_unit_at(k); // b. If unescapedSet contains char, then // NOTE: We know unescapedSet is ASCII-only, so ensure we have an ASCII codepoint before casting to char. - if (is_ascii(code_point) && unescaped_set.contains(static_cast(code_point))) { + if (is_ascii(code_unit) && unescaped_set.contains(static_cast(code_unit))) { // i. Let S be the String value containing the single code unit char. - escaped.append(code_point); + escaped.append(static_cast(code_unit)); } // c. Else, // i. Let n be the numeric value of char. // ii. If n < 256, then - else if (code_point < 256) { + else if (code_unit < 256) { // 1. Let hex be the String representation of n, formatted as an uppercase hexadecimal number. // 2. Let S be the string-concatenation of "%" and ! StringPad(hex, 2𝔽, "0", start). - escaped.appendff("%{:02X}", code_point); + escaped.appendff("%{:02X}", code_unit); } // iii. Else, else { // 1. Let hex be the String representation of n, formatted as an uppercase hexadecimal number. // 2. Let S be the string-concatenation of "%u" and ! StringPad(hex, 4𝔽, "0", start). - escaped.appendff("%u{:04X}", code_point); + escaped.appendff("%u{:04X}", code_unit); } // d. Set R to the string-concatenation of R and S. diff --git a/Libraries/LibJS/Runtime/RegExpObject.cpp b/Libraries/LibJS/Runtime/RegExpObject.cpp index 817095a4468..0d87d35146d 100644 --- a/Libraries/LibJS/Runtime/RegExpObject.cpp +++ b/Libraries/LibJS/Runtime/RegExpObject.cpp @@ -93,26 +93,21 @@ ErrorOr parse_regex_pattern(StringView pattern, if (unicode && unicode_sets) return ParseRegexPatternError { MUST(String::formatted(ErrorType::RegExpObjectIncompatibleFlags.message(), 'u', 'v')) }; - auto utf16_pattern_result = AK::utf8_to_utf16(pattern); - if (utf16_pattern_result.is_error()) - return ParseRegexPatternError { "Out of memory"_string }; - - auto utf16_result = utf16_pattern_result.release_value(); - Utf16View utf16_pattern_view { utf16_result }; + auto utf16_pattern = Utf16String::from_utf8(pattern); StringBuilder builder; // If the Unicode flag is set, append each code point to the pattern. Otherwise, append each // code unit. But unlike the spec, multi-byte code units must be escaped for LibRegex to parse. auto previous_code_unit_was_backslash = false; - for (size_t i = 0; i < utf16_pattern_view.length_in_code_units();) { + for (size_t i = 0; i < utf16_pattern.length_in_code_units();) { if (unicode || unicode_sets) { - auto code_point = code_point_at(utf16_pattern_view, i); + auto code_point = code_point_at(utf16_pattern, i); builder.append_code_point(code_point.code_point); i += code_point.code_unit_count; continue; } - u16 code_unit = utf16_pattern_view.code_unit_at(i); + u16 code_unit = utf16_pattern.code_unit_at(i); ++i; if (code_unit > 0x7f) { diff --git a/Libraries/LibRegex/RegexByteCode.cpp b/Libraries/LibRegex/RegexByteCode.cpp index a3e272cc940..8cbef6af595 100644 --- a/Libraries/LibRegex/RegexByteCode.cpp +++ b/Libraries/LibRegex/RegexByteCode.cpp @@ -512,7 +512,7 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M return ExecutionResult::Failed_ExecuteLowPrioForks; Optional str; - Utf16Data utf16; + Utf16String utf16; Vector data; data.ensure_capacity(length); for (size_t i = offset; i < offset + length; ++i) diff --git a/Libraries/LibRegex/RegexMatch.h b/Libraries/LibRegex/RegexMatch.h index b5ed2d5ca80..ddbc36fee83 100644 --- a/Libraries/LibRegex/RegexMatch.h +++ b/Libraries/LibRegex/RegexMatch.h @@ -8,14 +8,15 @@ #include "Forward.h" #include "RegexOptions.h" -#include #include #include +#include #include #include #include #include +#include #include #include #include @@ -110,7 +111,7 @@ public: return view; } - RegexStringView construct_as_same(Span data, Optional& optional_string_storage, Utf16Data& optional_utf16_storage) const + RegexStringView construct_as_same(Span data, Optional& optional_string_storage, Utf16String& optional_utf16_storage) const { auto view = m_view.visit( [&optional_string_storage, data](T const&) { @@ -121,11 +122,8 @@ public: return RegexStringView { T { *optional_string_storage } }; }, [&optional_utf16_storage, data](Utf16View) { - auto conversion_result = utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors(); - optional_utf16_storage = conversion_result.data; - auto view = Utf16View { optional_utf16_storage }; - view.unsafe_set_code_point_length(conversion_result.code_point_count); - return RegexStringView { view }; + optional_utf16_storage = Utf16String::from_utf32({ data.data(), data.size() }); + return RegexStringView { optional_utf16_storage.utf16_view() }; }); view.set_unicode(unicode()); diff --git a/Libraries/LibWeb/DOM/CharacterData.cpp b/Libraries/LibWeb/DOM/CharacterData.cpp index 72eacdfb157..6f685fe0424 100644 --- a/Libraries/LibWeb/DOM/CharacterData.cpp +++ b/Libraries/LibWeb/DOM/CharacterData.cpp @@ -46,9 +46,8 @@ WebIDL::ExceptionOr CharacterData::substring_data(size_t offset, size_t { // 1. Let length be node’s length. // FIXME: This is very inefficient! - auto utf16_result = MUST(AK::utf8_to_utf16(m_data)); - Utf16View utf16_view { utf16_result }; - auto length = utf16_view.length_in_code_units(); + auto utf16_string = Utf16String::from_utf8(m_data); + auto length = utf16_string.length_in_code_units(); // 2. If offset is greater than length, then throw an "IndexSizeError" DOMException. if (offset > length) @@ -57,10 +56,10 @@ WebIDL::ExceptionOr CharacterData::substring_data(size_t offset, size_t // 3. If offset plus count is greater than length, return a string whose value is the code units from the offsetth code unit // to the end of node’s data, and then return. if (offset + count > length) - return MUST(utf16_view.substring_view(offset).to_utf8()); + return MUST(utf16_string.substring_view(offset).to_utf8()); // 4. Return a string whose value is the code units from the offsetth code unit to the offset+countth code unit in node’s data. - return MUST(utf16_view.substring_view(offset, count).to_utf8()); + return MUST(utf16_string.substring_view(offset, count).to_utf8()); } // https://dom.spec.whatwg.org/#concept-cd-replace @@ -68,9 +67,8 @@ WebIDL::ExceptionOr CharacterData::replace_data(size_t offset, size_t coun { // 1. Let length be node’s length. // FIXME: This is very inefficient! - auto utf16_data = MUST(AK::utf8_to_utf16(m_data)); - Utf16View utf16_view { utf16_data }; - auto length = utf16_view.length_in_code_units(); + auto utf16_string = Utf16String::from_utf8(m_data); + auto length = utf16_string.length_in_code_units(); // 2. If offset is greater than length, then throw an "IndexSizeError" DOMException. if (offset > length) @@ -83,17 +81,17 @@ WebIDL::ExceptionOr CharacterData::replace_data(size_t offset, size_t coun // 5. Insert data into node’s data after offset code units. // 6. Let delete offset be offset + data’s length. // 7. Starting from delete offset code units, remove count code units from node’s data. - auto before_data = utf16_view.substring_view(0, offset); - auto inserted_data_result = MUST(AK::utf8_to_utf16(data)); - auto after_data = utf16_view.substring_view(offset + count); + auto before_data = utf16_string.substring_view(0, offset); + auto inserted_data = Utf16String::from_utf8(data); + auto after_data = utf16_string.substring_view(offset + count); - StringBuilder full_data(StringBuilder::Mode::UTF16, before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units()); + StringBuilder full_data(StringBuilder::Mode::UTF16, before_data.length_in_code_units() + inserted_data.length_in_code_units() + after_data.length_in_code_units()); full_data.append(before_data); - full_data.append(inserted_data_result.data); + full_data.append(inserted_data); full_data.append(after_data); - auto full_view = full_data.utf16_string_view(); - bool characters_are_the_same = utf16_view == full_view; + auto full_view = full_data.utf16_string_view(); + bool characters_are_the_same = utf16_string == full_view; auto old_data = m_data; // OPTIMIZATION: Skip UTF-8 encoding if the characters are the same. @@ -123,14 +121,14 @@ WebIDL::ExceptionOr CharacterData::replace_data(size_t offset, size_t coun // start offset by data’s length and decrease it by count. for (auto* range : Range::live_ranges()) { if (range->start_container() == this && range->start_offset() > (offset + count)) - range->set_start_offset(range->start_offset() + inserted_data_result.data.size() - count); + range->set_start_offset(range->start_offset() + inserted_data.length_in_code_units() - count); } // 11. For each live range whose end node is node and end offset is greater than offset plus count, increase its end // offset by data’s length and decrease it by count. for (auto* range : Range::live_ranges()) { if (range->end_container() == this && range->end_offset() > (offset + count)) - range->set_end_offset(range->end_offset() + inserted_data_result.data.size() - count); + range->set_end_offset(range->end_offset() + inserted_data.length_in_code_units() - count); } // 12. If node’s parent is non-null, then run the children changed steps for node’s parent. diff --git a/Libraries/LibWeb/DOM/Document.cpp b/Libraries/LibWeb/DOM/Document.cpp index 1a7524d1d4a..1d83b99c1cc 100644 --- a/Libraries/LibWeb/DOM/Document.cpp +++ b/Libraries/LibWeb/DOM/Document.cpp @@ -6158,8 +6158,7 @@ Vector> Document::find_matching_text(String const& query, CaseSe if (text_blocks.is_empty()) return {}; - auto utf16_query = MUST(AK::utf8_to_utf16(query)); - Utf16View query_view { utf16_query }; + auto utf16_query = Utf16String::from_utf8(query); Vector> matches; for (auto const& text_block : text_blocks) { @@ -6169,8 +6168,8 @@ Vector> Document::find_matching_text(String const& query, CaseSe auto* match_start_position = text_block.positions.data(); while (true) { auto match_index = case_sensitivity == CaseSensitivity::CaseInsensitive - ? text_view.find_code_unit_offset_ignoring_case(query_view, offset) - : text_view.find_code_unit_offset(query_view, offset); + ? text_view.find_code_unit_offset_ignoring_case(utf16_query, offset) + : text_view.find_code_unit_offset(utf16_query, offset); if (!match_index.has_value()) break; @@ -6181,15 +6180,15 @@ Vector> Document::find_matching_text(String const& query, CaseSe auto& start_dom_node = match_start_position->dom_node; auto* match_end_position = match_start_position; - for (; i < text_block.positions.size() - 1 && (match_index.value() + query_view.length_in_code_units() > text_block.positions[i + 1].start_offset); ++i) + for (; i < text_block.positions.size() - 1 && (match_index.value() + utf16_query.length_in_code_units() > text_block.positions[i + 1].start_offset); ++i) match_end_position = &text_block.positions[i + 1]; auto& end_dom_node = match_end_position->dom_node; - auto end_position = match_index.value() + query_view.length_in_code_units() - match_end_position->start_offset; + auto end_position = match_index.value() + utf16_query.length_in_code_units() - match_end_position->start_offset; matches.append(Range::create(start_dom_node, start_position, end_dom_node, end_position)); match_start_position = match_end_position; - offset = match_index.value() + query_view.length_in_code_units() + 1; + offset = match_index.value() + utf16_query.length_in_code_units() + 1; if (offset >= text_view.length_in_code_units()) break; } diff --git a/Libraries/LibWeb/Editing/Internal/Algorithms.cpp b/Libraries/LibWeb/Editing/Internal/Algorithms.cpp index cc8d165d5ab..1ad0579e9f1 100644 --- a/Libraries/LibWeb/Editing/Internal/Algorithms.cpp +++ b/Libraries/LibWeb/Editing/Internal/Algorithms.cpp @@ -384,9 +384,9 @@ void canonicalize_whitespace(DOM::BoundaryPoint boundary, bool fix_collapsed_spa auto parent_white_space_collapse = resolved_keyword(*start_node->parent(), CSS::PropertyID::WhiteSpaceCollapse); // FIXME: Find a way to get code points directly from the UTF-8 string - auto start_node_data = *start_node->text_content(); - auto utf16_code_units = MUST(AK::utf8_to_utf16(start_node_data)); - auto offset_minus_one_code_point = Utf16View { utf16_code_units }.code_point_at(start_offset - 1); + auto start_node_data = Utf16String::from_utf8(*start_node->text_content()); + auto offset_minus_one_code_point = start_node_data.code_point_at(start_offset - 1); + if (parent_white_space_collapse != CSS::Keyword::Preserve && (offset_minus_one_code_point == 0x20 || offset_minus_one_code_point == 0xA0)) { --start_offset; continue; @@ -437,9 +437,9 @@ void canonicalize_whitespace(DOM::BoundaryPoint boundary, bool fix_collapsed_spa auto parent_white_space_collapse = resolved_keyword(*end_node->parent(), CSS::PropertyID::WhiteSpaceCollapse); // FIXME: Find a way to get code points directly from the UTF-8 string - auto end_node_data = *end_node->text_content(); - auto utf16_code_units = MUST(AK::utf8_to_utf16(end_node_data)); - auto offset_code_point = Utf16View { utf16_code_units }.code_point_at(end_offset); + auto end_node_data = Utf16String::from_utf8(*end_node->text_content()); + auto offset_code_point = end_node_data.code_point_at(end_offset); + if (parent_white_space_collapse != CSS::Keyword::Preserve && (offset_code_point == 0x20 || offset_code_point == 0xA0)) { // 1. If fix collapsed space is true, and collapse spaces is true, and the end offsetth // code unit of end node's data is a space (0x0020): call deleteData(end offset, 1) @@ -556,16 +556,14 @@ void canonicalize_whitespace(DOM::BoundaryPoint boundary, bool fix_collapsed_spa // 1. Remove the first code unit from replacement whitespace, and let element be that // code unit. // FIXME: Find a way to get code points directly from the UTF-8 string - auto replacement_whitespace_utf16 = MUST(AK::utf8_to_utf16(replacement_whitespace)); - auto replacement_whitespace_utf16_view = Utf16View { replacement_whitespace_utf16 }; - replacement_whitespace = MUST(String::from_utf16({ replacement_whitespace_utf16_view.substring_view(1) })); - auto element = replacement_whitespace_utf16_view.code_point_at(0); + auto replacement_whitespace_utf16 = Utf16String::from_utf8(replacement_whitespace); + replacement_whitespace = MUST(replacement_whitespace_utf16.substring_view(1).to_utf8()); + auto element = replacement_whitespace_utf16.code_point_at(0); // 2. If element is not the same as the start offsetth code unit of start node's data: - auto start_node_data = *start_node->text_content(); - auto start_node_utf16 = MUST(AK::utf8_to_utf16(start_node_data)); - auto start_node_utf16_view = Utf16View { start_node_utf16 }; - auto start_node_code_point = start_node_utf16_view.code_point_at(start_offset); + auto start_node_data = Utf16String::from_utf8(*start_node->text_content()); + auto start_node_code_point = start_node_data.code_point_at(start_offset); + if (element != start_node_code_point) { // 1. Call insertData(start offset, element) on start node. auto& start_node_character_data = static_cast(*start_node); diff --git a/Libraries/LibWeb/FileAPI/FileReader.cpp b/Libraries/LibWeb/FileAPI/FileReader.cpp index 067519e2575..39a02437700 100644 --- a/Libraries/LibWeb/FileAPI/FileReader.cpp +++ b/Libraries/LibWeb/FileAPI/FileReader.cpp @@ -106,11 +106,10 @@ WebIDL::ExceptionOr FileReader::blob_package_data(JS::Realm& return JS::ArrayBuffer::create(realm, move(bytes)); case Type::BinaryString: // Return bytes as a binary string, in which every byte is represented by a code unit of equal value [0..255]. - Utf16Data builder; - builder.ensure_capacity(bytes.size()); + StringBuilder builder(StringBuilder::Mode::UTF16, bytes.size()); for (auto byte : bytes.bytes()) - builder.unchecked_append(byte); - return MUST(Utf16View { builder }.to_utf8()); + builder.append_code_unit(byte); + return MUST(builder.utf16_string_view().to_utf8()); } VERIFY_NOT_REACHED(); } diff --git a/Libraries/LibWeb/Infra/Strings.cpp b/Libraries/LibWeb/Infra/Strings.cpp index cbff0192f84..11c4ff45b17 100644 --- a/Libraries/LibWeb/Infra/Strings.cpp +++ b/Libraries/LibWeb/Infra/Strings.cpp @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -63,10 +64,8 @@ ErrorOr strip_and_collapse_whitespace(StringView string) // https://infra.spec.whatwg.org/#code-unit-prefix bool is_code_unit_prefix(StringView potential_prefix_utf8, StringView input_utf8) { - auto potential_prefix_utf16_bytes = MUST(utf8_to_utf16(potential_prefix_utf8)); - auto input_utf16_bytes = MUST(utf8_to_utf16(input_utf8)); - Utf16View potential_prefix { potential_prefix_utf16_bytes }; - Utf16View input { input_utf16_bytes }; + auto potential_prefix = Utf16String::from_utf8(potential_prefix_utf8); + auto input = Utf16String::from_utf8(input_utf8); // 1. Let i be 0. size_t i = 0; @@ -148,9 +147,10 @@ bool code_unit_less_than(StringView a, StringView b) if (a.is_ascii() && b.is_ascii()) return a < b; - auto a_utf16 = MUST(utf8_to_utf16(a)); - auto b_utf16 = MUST(utf8_to_utf16(b)); - return Utf16View { a_utf16 }.is_code_unit_less_than(Utf16View { b_utf16 }); + auto a_utf16 = Utf16String::from_utf8(a); + auto b_utf16 = Utf16String::from_utf8(b); + + return a_utf16.utf16_view().is_code_unit_less_than(b_utf16); } } diff --git a/Libraries/LibWeb/Layout/Viewport.cpp b/Libraries/LibWeb/Layout/Viewport.cpp index 6fcc55ced36..8f0fac3bb3a 100644 --- a/Libraries/LibWeb/Layout/Viewport.cpp +++ b/Libraries/LibWeb/Layout/Viewport.cpp @@ -50,17 +50,18 @@ Vector const& Viewport::text_blocks() void Viewport::update_text_blocks() { - StringBuilder builder; + StringBuilder builder(StringBuilder::Mode::UTF16); size_t current_start_position = 0; Vector text_positions; Vector text_blocks; + for_each_in_inclusive_subtree([&](auto const& layout_node) { if (layout_node.display().is_none() || !layout_node.first_paintable() || !layout_node.first_paintable()->is_visible()) return TraversalDecision::Continue; if (layout_node.is_box() || layout_node.is_generated()) { if (!builder.is_empty()) { - text_blocks.append({ MUST(AK::utf8_to_utf16(builder.string_view())), text_positions }); + text_blocks.append({ builder.to_utf16_string(), text_positions }); current_start_position = 0; text_positions.clear_with_capacity(); builder.clear(); @@ -79,10 +80,9 @@ void Viewport::update_text_blocks() text_positions.empend(dom_node, current_start_position); } - auto const& current_node_text = text_node->text_for_rendering(); - auto const current_node_text_utf16 = MUST(AK::utf8_to_utf16(current_node_text)); - current_start_position += current_node_text_utf16.data.size(); - builder.append(move(current_node_text)); + auto const& current_node_text = Utf16String::from_utf8(text_node->text_for_rendering()); + current_start_position += current_node_text.length_in_code_units(); + builder.append(current_node_text); } } @@ -90,7 +90,7 @@ void Viewport::update_text_blocks() }); if (!builder.is_empty()) - text_blocks.append({ MUST(AK::utf8_to_utf16(builder.string_view())), text_positions }); + text_blocks.append({ builder.to_utf16_string(), text_positions }); m_text_blocks = move(text_blocks); } diff --git a/Libraries/LibWeb/Layout/Viewport.h b/Libraries/LibWeb/Layout/Viewport.h index a39a2b1fd6b..fb5385e1dd1 100644 --- a/Libraries/LibWeb/Layout/Viewport.h +++ b/Libraries/LibWeb/Layout/Viewport.h @@ -6,6 +6,7 @@ #pragma once +#include #include #include @@ -24,7 +25,7 @@ public: size_t start_offset { 0 }; }; struct TextBlock { - AK::Utf16ConversionResult text; + Utf16String text; Vector positions; }; Vector const& text_blocks(); diff --git a/Libraries/LibWeb/Painting/PaintableFragment.cpp b/Libraries/LibWeb/Painting/PaintableFragment.cpp index 00541bb278a..da4ac8c7df9 100644 --- a/Libraries/LibWeb/Painting/PaintableFragment.cpp +++ b/Libraries/LibWeb/Painting/PaintableFragment.cpp @@ -252,9 +252,8 @@ Utf16View PaintableFragment::utf16_view() const return {}; if (!m_text_in_utf16.has_value()) - m_text_in_utf16 = MUST(AK::utf8_to_utf16(utf8_view())); - - return Utf16View { m_text_in_utf16.value() }; + m_text_in_utf16 = Utf16String::from_utf8(utf8_view().as_string()); + return *m_text_in_utf16; } } diff --git a/Libraries/LibWeb/Painting/PaintableFragment.h b/Libraries/LibWeb/Painting/PaintableFragment.h index 0bd69235cd8..bd4f68ec1a3 100644 --- a/Libraries/LibWeb/Painting/PaintableFragment.h +++ b/Libraries/LibWeb/Painting/PaintableFragment.h @@ -6,6 +6,7 @@ #pragma once +#include #include #include #include @@ -64,7 +65,7 @@ private: CSS::WritingMode m_writing_mode; Vector m_shadows; CSSPixels m_text_decoration_thickness { 0 }; - mutable Optional m_text_in_utf16; + mutable Optional m_text_in_utf16; }; } diff --git a/Libraries/LibWeb/SVG/SVGTextContentElement.cpp b/Libraries/LibWeb/SVG/SVGTextContentElement.cpp index 2c868316072..5ae4e36448f 100644 --- a/Libraries/LibWeb/SVG/SVGTextContentElement.cpp +++ b/Libraries/LibWeb/SVG/SVGTextContentElement.cpp @@ -48,8 +48,8 @@ ByteString SVGTextContentElement::text_contents() const // https://svgwg.org/svg2-draft/text.html#__svg__SVGTextContentElement__getNumberOfChars WebIDL::ExceptionOr SVGTextContentElement::get_number_of_chars() const { - auto chars = TRY_OR_THROW_OOM(vm(), utf8_to_utf16(text_contents())).data; - return static_cast(chars.size()); + auto length_in_code_units = AK::utf16_code_unit_length_from_utf8(text_contents()); + return static_cast(length_in_code_units); } GC::Ref SVGTextContentElement::get_start_position_of_char(WebIDL::UnsignedLong charnum) diff --git a/Tests/AK/TestUtf16View.cpp b/Tests/AK/TestUtf16View.cpp index ef7086e71c2..eed4f748fe0 100644 --- a/Tests/AK/TestUtf16View.cpp +++ b/Tests/AK/TestUtf16View.cpp @@ -15,7 +15,7 @@ TEST_CASE(decode_ascii) { - auto string = MUST(AK::utf8_to_utf16("Hello World!11"sv)); + auto string = Utf16String::from_utf8("Hello World!11"sv); Utf16View view { string }; size_t valid_code_units = 0; @@ -34,7 +34,7 @@ TEST_CASE(decode_ascii) TEST_CASE(decode_utf8) { - auto string = MUST(AK::utf8_to_utf16("Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv)); + auto string = Utf16String::from_utf8("Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv); Utf16View view { string }; size_t valid_code_units = 0; @@ -55,7 +55,7 @@ TEST_CASE(encode_utf8) { { auto utf8_string = "Привет, мир! 😀 γειά σου κόσμος こんにちは世界"_string; - auto string = MUST(AK::utf8_to_utf16(utf8_string)); + auto string = Utf16String::from_utf8(utf8_string); Utf16View view { string }; EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::Yes)), utf8_string); EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::No)), utf8_string); @@ -139,7 +139,7 @@ TEST_CASE(utf16_literal) TEST_CASE(iterate_utf16) { - auto string = MUST(AK::utf8_to_utf16("Привет 😀"sv)); + auto string = Utf16String::from_utf8("Привет 😀"sv); Utf16View view { string }; auto iterator = view.begin(); @@ -371,16 +371,16 @@ TEST_CASE(to_ascii_titlecase) TEST_CASE(equals_ignoring_case) { - auto string1 = MUST(AK::utf8_to_utf16("foobar"sv)); - auto string2 = MUST(AK::utf8_to_utf16("FooBar"sv)); + auto string1 = Utf16String::from_utf8("foobar"sv); + auto string2 = Utf16String::from_utf8("FooBar"sv); EXPECT(Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 })); - string1 = MUST(AK::utf8_to_utf16(""sv)); - string2 = MUST(AK::utf8_to_utf16(""sv)); + string1 = Utf16String::from_utf8(""sv); + string2 = Utf16String::from_utf8(""sv); EXPECT(Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 })); - string1 = MUST(AK::utf8_to_utf16(""sv)); - string2 = MUST(AK::utf8_to_utf16("FooBar"sv)); + string1 = Utf16String::from_utf8(""sv); + string2 = Utf16String::from_utf8("FooBar"sv); EXPECT(!Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 })); } @@ -425,7 +425,7 @@ TEST_CASE(replace) TEST_CASE(substring_view) { - auto string = MUST(AK::utf8_to_utf16("Привет 😀"sv)); + auto string = Utf16String::from_utf8("Привет 😀"sv); { Utf16View view { string }; view = view.substring_view(7, 2); @@ -532,7 +532,7 @@ TEST_CASE(starts_with) TEST_CASE(find_code_unit_offset) { - auto conversion_result = MUST(AK::utf8_to_utf16("😀foo😀bar"sv)); + auto conversion_result = Utf16String::from_utf8("😀foo😀bar"sv); Utf16View const view { conversion_result }; EXPECT_EQ(0u, view.find_code_unit_offset(u""sv).value()); @@ -549,7 +549,7 @@ TEST_CASE(find_code_unit_offset) TEST_CASE(find_code_unit_offset_ignoring_case) { - auto conversion_result = MUST(AK::utf8_to_utf16("😀Foo😀Bar"sv)); + auto conversion_result = Utf16String::from_utf8("😀Foo😀Bar"sv); Utf16View const view { conversion_result }; EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u""sv).value()); diff --git a/Tests/LibRegex/TestRegex.cpp b/Tests/LibRegex/TestRegex.cpp index 8bb355106ab..e0345476006 100644 --- a/Tests/LibRegex/TestRegex.cpp +++ b/Tests/LibRegex/TestRegex.cpp @@ -823,7 +823,7 @@ TEST_CASE(ECMA262_unicode_match) for (auto& test : tests) { Regex re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | test.options); - auto subject = MUST(AK::utf8_to_utf16(test.subject)); + auto subject = Utf16String::from_utf8(test.subject); Utf16View view { subject }; if constexpr (REGEX_DEBUG) { @@ -956,7 +956,7 @@ TEST_CASE(ECMA262_property_match) for (auto& test : tests) { Regex re(test.pattern, (ECMAScriptFlags)regex::AllFlags::Global | regex::ECMAScriptFlags::BrowserExtended | test.options); - auto subject = MUST(AK::utf8_to_utf16(test.subject)); + auto subject = Utf16String::from_utf8(test.subject); Utf16View view { subject }; if constexpr (REGEX_DEBUG) { diff --git a/Tests/LibUnicode/TestSegmenter.cpp b/Tests/LibUnicode/TestSegmenter.cpp index ccb17bd26ad..66bb870636c 100644 --- a/Tests/LibUnicode/TestSegmenter.cpp +++ b/Tests/LibUnicode/TestSegmenter.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -155,21 +156,21 @@ TEST_CASE(out_of_bounds) EXPECT(!result.has_value()); } { - auto text = MUST(AK::utf8_to_utf16("foo"sv)); + auto text = u"foo"_utf16; auto segmenter = Unicode::Segmenter::create(Unicode::SegmenterGranularity::Word); - segmenter->set_segmented_text(Utf16View { text }); + segmenter->set_segmented_text(text); - auto result = segmenter->previous_boundary(text.data.size() + 1); + auto result = segmenter->previous_boundary(text.length_in_code_units() + 1); EXPECT(result.has_value()); - result = segmenter->next_boundary(text.data.size() + 1); + result = segmenter->next_boundary(text.length_in_code_units() + 1); EXPECT(!result.has_value()); - result = segmenter->previous_boundary(text.data.size()); + result = segmenter->previous_boundary(text.length_in_code_units()); EXPECT(result.has_value()); - result = segmenter->next_boundary(text.data.size()); + result = segmenter->next_boundary(text.length_in_code_units()); EXPECT(!result.has_value()); result = segmenter->next_boundary(0);