From 86b1c78c1a428d489de433b9f9bba31799203a42 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Thu, 26 Jun 2025 12:52:23 -0400 Subject: [PATCH] AK+Everywhere: Prepare Utf16View for integration with a UTF-16 string To prepare for an upcoming Utf16String, this migrates Utf16View to store its data as a char16_t. Most function definitions are moved inline and made constexpr. This also adds a UDL to construct a Utf16View from a string literal: auto string = u"hello"sv; This lets us remove the NTTP Utf16View constructor, as we have found that such constructors bloat binary size quite a bit. --- AK/String.cpp | 17 +- AK/StringBase.h | 6 + AK/StringBuilder.cpp | 18 +- AK/StringView.cpp | 4 +- AK/UnicodeUtils.h | 2 +- AK/Utf16View.cpp | 293 ++++------------- AK/Utf16View.h | 294 ++++++++++++++---- Libraries/LibCore/ProcessWindows.cpp | 2 +- .../LibJS/Runtime/AbstractOperations.cpp | 28 +- Libraries/LibJS/Runtime/Utf16String.cpp | 2 +- Libraries/LibJS/Runtime/Utf16String.h | 2 +- Libraries/LibRegex/RegexMatch.h | 3 +- Libraries/LibUnicode/ICU.cpp | 3 +- Libraries/LibUnicode/Segmenter.cpp | 2 +- Libraries/LibWeb/DOM/CharacterData.cpp | 5 +- Libraries/LibWeb/FileAPI/FileReader.cpp | 2 +- Tests/AK/TestUtf16View.cpp | 144 ++++----- 17 files changed, 406 insertions(+), 421 deletions(-) diff --git a/AK/String.cpp b/AK/String.cpp index 37d8381003b..e430dede5ea 100644 --- a/AK/String.cpp +++ b/AK/String.cpp @@ -93,22 +93,7 @@ ErrorOr String::from_utf16_be(ReadonlyBytes bytes) ErrorOr String::from_utf16(Utf16View const& utf16) { - if (!utf16.validate()) - return Error::from_string_literal("String::from_utf16: Input was not valid UTF-16"); - if (utf16.is_empty()) - return String {}; - - String result; - - auto utf8_length = simdutf::utf8_length_from_utf16(utf16.char_data(), utf16.length_in_code_units()); - - TRY(result.replace_with_new_string(utf8_length, [&](Bytes buffer) -> ErrorOr { - [[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(utf16.char_data(), 
utf16.length_in_code_units(), reinterpret_cast(buffer.data())); - ASSERT(result == buffer.size()); - return {}; - })); - - return result; + return utf16.to_utf8(); } ErrorOr String::from_stream(Stream& stream, size_t byte_count) diff --git a/AK/StringBase.h b/AK/StringBase.h index 50635e9c8d3..f9fc61ed30e 100644 --- a/AK/StringBase.h +++ b/AK/StringBase.h @@ -86,6 +86,12 @@ public: return replace_with_new_string(byte_count, forward(callback)); } + template + ALWAYS_INLINE ErrorOr replace_with_new_string(Badge, size_t byte_count, Func&& callback) + { + return replace_with_new_string(byte_count, forward(callback)); + } + protected: template ErrorOr replace_with_new_string(size_t byte_count, Func&& callback) diff --git a/AK/StringBuilder.cpp b/AK/StringBuilder.cpp index 55ff609ef12..53bfa358ae9 100644 --- a/AK/StringBuilder.cpp +++ b/AK/StringBuilder.cpp @@ -250,17 +250,17 @@ ErrorOr StringBuilder::try_append(Utf16View const& utf16_view) if (utf16_view.is_empty()) return {}; - auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(utf16_view.span()); + auto remaining_view = utf16_view.span(); + auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(remaining_view); // Possibly over-allocate a little to ensure we don't have to allocate later. TRY(will_append(maximum_utf8_length)); - Utf16View remaining_view = utf16_view; for (;;) { - auto uninitialized_data_pointer = static_cast(m_buffer.end_pointer()); + auto* uninitialized_data_pointer = static_cast(m_buffer.end_pointer()); // Fast path. 
- auto result = simdutf::convert_utf16_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer); + auto result = simdutf::convert_utf16_to_utf8_with_errors(remaining_view.data(), remaining_view.size(), uninitialized_data_pointer); if (result.error == simdutf::SUCCESS) { auto bytes_just_written = result.count; m_buffer.set_size(m_buffer.size() + bytes_just_written); @@ -269,13 +269,13 @@ ErrorOr StringBuilder::try_append(Utf16View const& utf16_view) // Slow path. Found unmatched surrogate code unit. auto first_invalid_code_unit = result.count; - ASSERT(first_invalid_code_unit < remaining_view.length_in_code_units()); + ASSERT(first_invalid_code_unit < remaining_view.size()); // Unfortunately, `simdutf` does not tell us how many bytes it just wrote in case of an error, so we have to calculate it ourselves. - auto bytes_just_written = simdutf::utf8_length_from_utf16(remaining_view.char_data(), first_invalid_code_unit); + auto bytes_just_written = simdutf::utf8_length_from_utf16(remaining_view.data(), first_invalid_code_unit); do { - auto code_unit = remaining_view.code_unit_at(first_invalid_code_unit++); + auto code_unit = remaining_view[first_invalid_code_unit++]; // Invalid surrogate code units are U+D800 - U+DFFF, so they are always encoded using 3 bytes. 
ASSERT(code_unit >= 0xD800 && code_unit <= 0xDFFF); @@ -283,11 +283,11 @@ ErrorOr StringBuilder::try_append(Utf16View const& utf16_view) uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 12) & 0x0f) | 0xe0); uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 6) & 0x3f) | 0x80); uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 0) & 0x3f) | 0x80); - } while (first_invalid_code_unit < remaining_view.length_in_code_units() && UnicodeUtils::is_utf16_low_surrogate(remaining_view.data()[first_invalid_code_unit])); + } while (first_invalid_code_unit < remaining_view.size() && UnicodeUtils::is_utf16_low_surrogate(remaining_view.data()[first_invalid_code_unit])); // Code unit might no longer be invalid, retry on the remaining data. m_buffer.set_size(m_buffer.size() + bytes_just_written); - remaining_view = remaining_view.substring_view(first_invalid_code_unit); + remaining_view = remaining_view.slice(first_invalid_code_unit); } return {}; diff --git a/AK/StringView.cpp b/AK/StringView.cpp index b5519a3fe1b..d6039331f36 100644 --- a/AK/StringView.cpp +++ b/AK/StringView.cpp @@ -212,7 +212,7 @@ String StringView::to_ascii_lowercase_string() const String result; - MUST(result.replace_with_new_string({}, length(), [&](Bytes buffer) -> ErrorOr { + MUST(result.replace_with_new_string(Badge {}, length(), [&](Bytes buffer) -> ErrorOr { for (auto [i, character] : enumerate(bytes())) buffer[i] = static_cast(AK::to_ascii_lowercase(character)); return {}; @@ -227,7 +227,7 @@ String StringView::to_ascii_uppercase_string() const String result; - MUST(result.replace_with_new_string({}, length(), [&](Bytes buffer) -> ErrorOr { + MUST(result.replace_with_new_string(Badge {}, length(), [&](Bytes buffer) -> ErrorOr { for (auto [i, character] : enumerate(bytes())) buffer[i] = static_cast(AK::to_ascii_uppercase(character)); return {}; diff --git a/AK/UnicodeUtils.h b/AK/UnicodeUtils.h index a9518b55314..6c1cb6fc906 100644 --- a/AK/UnicodeUtils.h 
+++ b/AK/UnicodeUtils.h @@ -158,7 +158,7 @@ constexpr ErrorOr try_code_point_to_utf16(u32 code_point, Callback callb * Compute the maximum number of UTF-8 bytes needed to store a given UTF-16 string, accounting for unmatched UTF-16 surrogates. * This function will overcount by at most 33%; 2 bytes for every valid UTF-16 codepoint between U+100000 and U+10FFFF. */ -[[nodiscard]] static inline size_t maximum_utf8_length_from_utf16(ReadonlySpan code_units) +[[nodiscard]] static inline size_t maximum_utf8_length_from_utf16(ReadonlySpan code_units) { // # UTF-8 code point -> no. UTF-8 bytes needed // U+0000 - U+007F => 1 UTF-8 bytes diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp index 3ba4fe370af..84a43c939c6 100644 --- a/AK/Utf16View.cpp +++ b/AK/Utf16View.cpp @@ -80,77 +80,75 @@ ErrorOr utf32_to_utf16(Utf32View const& utf32_view) return Utf16ConversionResult { utf16_data, length }; } +bool validate_utf16_le(ReadonlyBytes bytes) +{ + return simdutf::validate_utf16le(reinterpret_cast(bytes.data()), bytes.size() / 2); +} + +bool validate_utf16_be(ReadonlyBytes bytes) +{ + return simdutf::validate_utf16be(reinterpret_cast(bytes.data()), bytes.size() / 2); +} + size_t utf16_code_unit_length_from_utf8(StringView string) { return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length()); } +ErrorOr Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const +{ + if (is_empty()) + return String {}; + if (!validate(allow_invalid_code_units)) + return Error::from_string_literal("Input was not valid UTF-16"); + + if (allow_invalid_code_units == AllowInvalidCodeUnits::No) { + String result; + auto utf8_length = simdutf::utf8_length_from_utf16(m_string, length_in_code_units()); + + TRY(result.replace_with_new_string(Badge {}, utf8_length, [&](Bytes buffer) -> ErrorOr { + [[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(m_string, length_in_code_units(), reinterpret_cast(buffer.data())); + ASSERT(result == 
buffer.size()); + return {}; + })); + + return result; + } + + StringBuilder builder; + builder.append(*this); + return builder.to_string(); +} + ErrorOr Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invalid_code_units) const { return TRY(to_utf8(allow_invalid_code_units)).to_byte_string(); } -ErrorOr Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const -{ - if (allow_invalid_code_units == AllowInvalidCodeUnits::No) - return String::from_utf16(*this); - - StringBuilder builder; - builder.append(*this); - return builder.to_string(); -} - bool Utf16View::is_ascii() const { - return simdutf::validate_ascii(reinterpret_cast(m_code_units.data()), length_in_code_units() * sizeof(char16_t)); + return simdutf::validate_ascii(reinterpret_cast(m_string), length_in_code_units() * sizeof(char16_t)); } -size_t Utf16View::length_in_code_points() const +bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const { - if (m_length_in_code_points == NumericLimits::max()) - m_length_in_code_points = calculate_length_in_code_points(); - return m_length_in_code_points; -} + auto view = *this; + valid_code_units = 0; -u16 Utf16View::code_unit_at(size_t index) const -{ - VERIFY(index < length_in_code_units()); - return m_code_units[index]; -} + while (!view.is_empty()) { + auto result = simdutf::validate_utf16_with_errors(view.m_string, view.length_in_code_units()); + valid_code_units += result.count; -u32 Utf16View::code_point_at(size_t index) const -{ - VERIFY(index < length_in_code_units()); + if (result.error == simdutf::SUCCESS) + return true; + if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE) + return false; - u32 code_point = code_unit_at(index); - if (!UnicodeUtils::is_utf16_high_surrogate(code_point) && !UnicodeUtils::is_utf16_low_surrogate(code_point)) - return code_point; - if (UnicodeUtils::is_utf16_low_surrogate(code_point) || (index + 1 == 
length_in_code_units())) - return code_point; - - auto second = code_unit_at(index + 1); - if (!UnicodeUtils::is_utf16_low_surrogate(second)) - return code_point; - - return UnicodeUtils::decode_utf16_surrogate_pair(code_point, second); -} - -size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const -{ - if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit. - return code_unit_offset; - - size_t code_point_offset = 0; - - for (auto it = begin(); it != end(); ++it) { - if (code_unit_offset == 0) - return code_point_offset; - - code_unit_offset -= it.length_in_code_units(); - ++code_point_offset; + view = view.substring_view(result.count + 1); + ++valid_code_units; } - return code_point_offset; + return true; } size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const @@ -171,19 +169,22 @@ size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const return code_unit_offset; } -size_t Utf16View::code_unit_offset_of(Utf16CodePointIterator const& it) const +size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const { - VERIFY(it.m_ptr >= begin_ptr()); - VERIFY(it.m_ptr <= end_ptr()); + if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit. 
+ return code_unit_offset; - return it.m_ptr - begin_ptr(); -} + size_t code_point_offset = 0; -Utf16View Utf16View::substring_view(size_t code_unit_offset, size_t code_unit_length) const -{ - VERIFY(!Checked::addition_would_overflow(code_unit_offset, code_unit_length)); + for (auto it = begin(); it != end(); ++it) { + if (code_unit_offset == 0) + return code_point_offset; - return Utf16View { m_code_units.slice(code_unit_offset, code_unit_length) }; + code_unit_offset -= it.length_in_code_units(); + ++code_point_offset; + } + + return code_point_offset; } Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const @@ -194,7 +195,10 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit. return substring_view(code_point_offset, code_point_length); - auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { return it.m_ptr - begin_ptr(); }; + auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { + return it.m_iterator - m_string; + }; + size_t code_point_index = 0; size_t code_unit_offset = 0; @@ -213,101 +217,13 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod VERIFY_NOT_REACHED(); } -Optional Utf16View::find_code_unit_offset(Utf16View const& needle, size_t start_offset) const -{ - return m_code_units.index_of(needle.m_code_units, start_offset); -} - -Optional Utf16View::find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset) const -{ - Checked maximum_offset { start_offset }; - maximum_offset += needle.length_in_code_units(); - if (maximum_offset.has_overflow() || maximum_offset.value() > length_in_code_units()) - return {}; - - if (needle.is_empty()) - return start_offset; - - size_t index = start_offset; - while (index <= length_in_code_units() - needle.length_in_code_units()) { - Utf16View const slice { 
m_code_units.slice(index, needle.length_in_code_units()) }; - if (slice.equals_ignoring_case(needle)) - return index; - index += slice.begin().length_in_code_units(); - } - - return {}; -} - -bool Utf16View::starts_with(Utf16View const& needle) const -{ - if (needle.is_empty()) - return true; - if (is_empty()) - return false; - if (needle.length_in_code_units() > length_in_code_units()) - return false; - if (begin_ptr() == needle.begin_ptr()) - return true; - - for (auto this_it = begin(), needle_it = needle.begin(); needle_it != needle.end(); ++needle_it, ++this_it) { - if (*this_it != *needle_it) - return false; - } - - return true; -} - -// https://infra.spec.whatwg.org/#code-unit-less-than -bool Utf16View::is_code_unit_less_than(Utf16View const& other) const -{ - auto a = m_code_units; - auto b = other.m_code_units; - - auto common_length = min(a.size(), b.size()); - - for (size_t position = 0; position < common_length; ++position) { - if (a[position] != b[position]) - return a[position] < b[position]; - } - - return a.size() < b.size(); -} - -bool Utf16View::validate(AllowInvalidCodeUnits allow_invalid_code_units) const -{ - size_t valid_code_units = 0; - return validate(valid_code_units, allow_invalid_code_units); -} - -bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const -{ - auto view = *this; - valid_code_units = 0; - - while (!view.is_empty()) { - auto result = simdutf::validate_utf16_with_errors(view.char_data(), view.length_in_code_units()); - valid_code_units += result.count; - - if (result.error == simdutf::SUCCESS) - return true; - if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE) - return false; - - view = view.substring_view(result.count + 1); - ++valid_code_units; - } - - return true; -} - size_t Utf16View::calculate_length_in_code_points() const { // FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a 
replacement // for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can // remove this branch. if (validate()) [[likely]] - return simdutf::count_utf16(char_data(), length_in_code_units()); + return simdutf::count_utf16(m_string, length_in_code_units()); size_t code_points = 0; for ([[maybe_unused]] auto code_point : *this) @@ -315,81 +231,4 @@ size_t Utf16View::calculate_length_in_code_points() const return code_points; } -bool Utf16View::equals_ignoring_case(Utf16View const& other) const -{ - if (length_in_code_units() != other.length_in_code_units()) - return false; - - for (size_t i = 0; i < length_in_code_units(); ++i) { - // FIXME: Handle non-ASCII case insensitive comparisons. - if (to_ascii_lowercase(m_code_units[i]) != to_ascii_lowercase(other.m_code_units[i])) - return false; - } - - return true; -} - -Utf16CodePointIterator& Utf16CodePointIterator::operator++() -{ - size_t code_units = length_in_code_units(); - - if (code_units > m_remaining_code_units) { - // If there aren't enough code units remaining, skip to the end. - m_ptr += m_remaining_code_units; - m_remaining_code_units = 0; - } else { - m_ptr += code_units; - m_remaining_code_units -= code_units; - } - - return *this; -} - -u32 Utf16CodePointIterator::operator*() const -{ - VERIFY(m_remaining_code_units > 0); - - // rfc2781, 2.2 Decoding UTF-16 - // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value - // of W1. Terminate. - // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence - // is in error and no valid character can be obtained using W1. - // Terminate. - // 3) If there is no W2 (that is, the sequence ends with W1), or if W2 - // is not between 0xDC00 and 0xDFFF, the sequence is in error. - // Terminate. - // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order - // bits of W1 as its 10 high-order bits and the 10 low-order bits of - // W2 as its 10 low-order bits. 
- // 5) Add 0x10000 to U' to obtain the character value U. Terminate. - - auto code_unit = *m_ptr; - - if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) { - if (m_remaining_code_units > 1) { - auto next_code_unit = *(m_ptr + 1); - - if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit)) - return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit); - } - - return UnicodeUtils::REPLACEMENT_CODE_POINT; - } - - if (UnicodeUtils::is_utf16_low_surrogate(code_unit)) - return UnicodeUtils::REPLACEMENT_CODE_POINT; - - return static_cast(code_unit); -} - -bool validate_utf16_le(ReadonlyBytes bytes) -{ - return simdutf::validate_utf16le(reinterpret_cast(bytes.data()), bytes.size() / 2); -} - -bool validate_utf16_be(ReadonlyBytes bytes) -{ - return simdutf::validate_utf16be(reinterpret_cast(bytes.data()), bytes.size() / 2); -} - } diff --git a/AK/Utf16View.h b/AK/Utf16View.h index 5bcd7b727f4..2eb04632a1e 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -21,7 +22,7 @@ namespace AK { -using Utf16Data = Vector; +using Utf16Data = Vector; struct Utf16ConversionResult { Utf16Data data; @@ -36,8 +37,6 @@ ErrorOr utf32_to_utf16(Utf32View const&); size_t utf16_code_unit_length_from_utf8(StringView); -class Utf16View; - class Utf16CodePointIterator { friend class Utf16View; @@ -45,27 +44,57 @@ public: Utf16CodePointIterator() = default; ~Utf16CodePointIterator() = default; - bool operator==(Utf16CodePointIterator const& other) const + constexpr Utf16CodePointIterator& operator++() { - return (m_ptr == other.m_ptr) && (m_remaining_code_units == other.m_remaining_code_units); + VERIFY(m_remaining_code_units > 0); + + auto length = min(length_in_code_units(), m_remaining_code_units); + m_iterator += length; + m_remaining_code_units -= length; + + return *this; } - Utf16CodePointIterator& operator++(); - u32 operator*() const; + constexpr u32 operator*() const + { + 
VERIFY(m_remaining_code_units > 0); + auto code_unit = *m_iterator; - size_t length_in_code_units() const + if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) { + if (m_remaining_code_units > 1) { + auto next_code_unit = *(m_iterator + 1); + + if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit)) + return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit); + } + + return UnicodeUtils::REPLACEMENT_CODE_POINT; + } + + if (UnicodeUtils::is_utf16_low_surrogate(code_unit)) + return UnicodeUtils::REPLACEMENT_CODE_POINT; + + return static_cast(code_unit); + } + + [[nodiscard]] constexpr bool operator==(Utf16CodePointIterator const& other) const + { + return (m_iterator == other.m_iterator) && (m_remaining_code_units == other.m_remaining_code_units); + } + + [[nodiscard]] constexpr size_t length_in_code_units() const { return UnicodeUtils::code_unit_length_for_code_point(**this); } private: - Utf16CodePointIterator(u16 const* ptr, size_t length) - : m_ptr(ptr) + Utf16CodePointIterator(char16_t const* ptr, size_t length) + : m_iterator(ptr) , m_remaining_code_units(length) { } - u16 const* m_ptr { nullptr }; + char16_t const* m_iterator { nullptr }; size_t m_remaining_code_units { 0 }; }; @@ -73,101 +102,233 @@ class Utf16View { public: using Iterator = Utf16CodePointIterator; + enum class AllowInvalidCodeUnits { + No, + Yes, + }; + Utf16View() = default; ~Utf16View() = default; - explicit Utf16View(ReadonlySpan code_units) - : m_code_units(code_units) + constexpr Utf16View(char16_t const* string, size_t length_in_code_units) + : m_string(string) + , m_length_in_code_units(length_in_code_units) + { + } + + constexpr Utf16View(Utf16Data const& string) + : m_string(string.data()) + , m_length_in_code_units(string.size()) { } Utf16View(Utf16ConversionResult&&) = delete; explicit Utf16View(Utf16ConversionResult const& conversion_result) - : m_code_units(conversion_result.data) + : m_string(conversion_result.data.data()) + , 
m_length_in_code_units(conversion_result.data.size()) , m_length_in_code_points(conversion_result.code_point_count) { } - template - Utf16View(char16_t const (&code_units)[Size]) - : m_code_units( - reinterpret_cast(&code_units[0]), - code_units[Size - 1] == u'\0' ? Size - 1 : Size) + ErrorOr to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; + ErrorOr to_byte_string(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; + + [[nodiscard]] constexpr ReadonlySpan span() const { + return { m_string, length_in_code_units() }; } - bool operator==(Utf16View const& other) const { return m_code_units == other.m_code_units; } + [[nodiscard]] constexpr bool operator==(Utf16View const& other) const + { + if (length_in_code_units() != other.length_in_code_units()) + return false; + return TypedTransfer::compare(m_string, other.m_string, length_in_code_units()); + } - enum class AllowInvalidCodeUnits { - Yes, - No, - }; + [[nodiscard]] constexpr bool equals_ignoring_case(Utf16View const& other) const + { + // FIXME: Handle non-ASCII case insensitive comparisons. 
+ return equals_ignoring_ascii_case(other); + } - ErrorOr to_byte_string(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; - ErrorOr to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; + [[nodiscard]] constexpr bool equals_ignoring_ascii_case(Utf16View const& other) const + { + if (length_in_code_units() != other.length_in_code_units()) + return false; - void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; } + for (size_t i = 0; i < length_in_code_units(); ++i) { + if (to_ascii_lowercase(code_unit_at(i)) != to_ascii_lowercase(other.code_unit_at(i))) + return false; + } - bool is_null() const { return m_code_units.is_null(); } - bool is_empty() const { return m_code_units.is_empty(); } - bool is_ascii() const; + return true; + } - size_t length_in_code_units() const { return m_code_units.size(); } - size_t length_in_code_points() const; + template + [[nodiscard]] constexpr bool is_one_of(Ts&&... strings) const + { + return (this->operator==(forward(strings)) || ...); + } - Optional length_in_code_points_if_known() const + template + [[nodiscard]] constexpr bool is_one_of_ignoring_ascii_case(Ts&&... 
strings) const + { + return (this->equals_ignoring_ascii_case(forward(strings)) || ...); + } + + [[nodiscard]] constexpr u32 hash() const + { + if (is_empty()) + return 0; + return string_hash(reinterpret_cast(m_string), length_in_code_units() * sizeof(char16_t)); + } + + [[nodiscard]] constexpr bool is_null() const { return m_string == nullptr; } + [[nodiscard]] constexpr bool is_empty() const { return length_in_code_units() == 0; } + [[nodiscard]] bool is_ascii() const; + + [[nodiscard]] ALWAYS_INLINE bool validate(AllowInvalidCodeUnits allow_invalid_code_units = AllowInvalidCodeUnits::No) const + { + size_t valid_code_units = 0; + return validate(valid_code_units, allow_invalid_code_units); + } + + [[nodiscard]] bool validate(size_t& valid_code_units, AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; + + [[nodiscard]] constexpr size_t length_in_code_units() const { return m_length_in_code_units; } + + [[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const + { + if (m_length_in_code_points == NumericLimits::max()) + m_length_in_code_points = calculate_length_in_code_points(); + return m_length_in_code_points; + } + + constexpr Optional length_in_code_points_if_known() const { if (m_length_in_code_points == NumericLimits::max()) return {}; return m_length_in_code_points; } - u32 hash() const + constexpr void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; } + + [[nodiscard]] constexpr char16_t code_unit_at(size_t index) const { - if (is_empty()) - return 0; - return string_hash(reinterpret_cast(m_code_units.data()), m_code_units.size() * sizeof(u16)); + VERIFY(index < length_in_code_units()); + return m_string[index]; } - Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size() }; } - Utf16CodePointIterator end() const { return { end_ptr(), 0 }; } + [[nodiscard]] constexpr u32 code_point_at(size_t index) const + { + VERIFY(index < length_in_code_units()); + u32 code_point = 
code_unit_at(index); - u16 const* data() const { return m_code_units.data(); } - char16_t const* char_data() const { return reinterpret_cast(data()); } + if (!UnicodeUtils::is_utf16_high_surrogate(code_point) && !UnicodeUtils::is_utf16_low_surrogate(code_point)) + return code_point; + if (UnicodeUtils::is_utf16_low_surrogate(code_point) || (index + 1 == length_in_code_units())) + return code_point; - ReadonlySpan span() const { return m_code_units; } + auto second = code_unit_at(index + 1); + if (!UnicodeUtils::is_utf16_low_surrogate(second)) + return code_point; - u16 code_unit_at(size_t index) const; - u32 code_point_at(size_t index) const; + return UnicodeUtils::decode_utf16_surrogate_pair(code_point, second); + } - size_t code_point_offset_of(size_t code_unit_offset) const; - size_t code_unit_offset_of(size_t code_point_offset) const; - size_t code_unit_offset_of(Utf16CodePointIterator const&) const; + [[nodiscard]] size_t code_unit_offset_of(size_t code_point_offset) const; + [[nodiscard]] size_t code_point_offset_of(size_t code_unit_offset) const; - Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const; - Utf16View substring_view(size_t code_unit_offset) const { return substring_view(code_unit_offset, length_in_code_units() - code_unit_offset); } + [[nodiscard]] constexpr Utf16CodePointIterator begin() const + { + return { m_string, length_in_code_units() }; + } - Utf16View unicode_substring_view(size_t code_point_offset, size_t code_point_length) const; - Utf16View unicode_substring_view(size_t code_point_offset) const { return unicode_substring_view(code_point_offset, length_in_code_points() - code_point_offset); } + [[nodiscard]] constexpr Utf16CodePointIterator end() const + { + return { m_string + length_in_code_units(), 0 }; + } - Optional find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const; - Optional find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset = 0) const; + 
[[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const + { + VERIFY(code_unit_offset + code_unit_length <= length_in_code_units()); + return { m_string + code_unit_offset, code_unit_length }; + } - bool starts_with(Utf16View const&) const; - bool is_code_unit_less_than(Utf16View const& other) const; + [[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset) const { return substring_view(code_unit_offset, length_in_code_units() - code_unit_offset); } - bool validate(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; - bool validate(size_t& valid_code_units, AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; + [[nodiscard]] Utf16View unicode_substring_view(size_t code_point_offset, size_t code_point_length) const; + [[nodiscard]] Utf16View unicode_substring_view(size_t code_point_offset) const { return unicode_substring_view(code_point_offset, length_in_code_points() - code_point_offset); } - bool equals_ignoring_case(Utf16View const&) const; + constexpr Optional find_code_unit_offset(char16_t needle, size_t start_offset = 0) const + { + if (start_offset >= length_in_code_units()) + return {}; + return AK::memmem_optional(m_string + start_offset, (length_in_code_units() - start_offset) * sizeof(char16_t), &needle, sizeof(needle)); + } + + constexpr Optional find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const + { + return span().index_of(needle.span(), start_offset); + } + + constexpr Optional find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset = 0) const + { + Checked maximum_offset { start_offset }; + maximum_offset += needle.length_in_code_units(); + if (maximum_offset.has_overflow() || maximum_offset.value() > length_in_code_units()) + return {}; + + if (needle.is_empty()) + return start_offset; + + size_t index = start_offset; + while (index <= length_in_code_units() - needle.length_in_code_units()) { + auto slice = 
substring_view(index, needle.length_in_code_units()); + if (slice.equals_ignoring_case(needle)) + return index; + + index += slice.begin().length_in_code_units(); + } + + return {}; + } + + [[nodiscard]] constexpr bool starts_with(Utf16View const& needle) const + { + if (needle.is_empty()) + return true; + if (is_empty()) + return false; + if (needle.length_in_code_units() > length_in_code_units()) + return false; + + if (m_string == needle.m_string) + return true; + return span().starts_with(needle.span()); + } + + // https://infra.spec.whatwg.org/#code-unit-less-than + [[nodiscard]] constexpr bool is_code_unit_less_than(Utf16View const& other) const + { + auto common_length = min(length_in_code_units(), other.length_in_code_units()); + + for (size_t position = 0; position < common_length; ++position) { + auto this_code_unit = code_unit_at(position); + auto other_code_unit = other.code_unit_at(position); + + if (this_code_unit != other_code_unit) + return this_code_unit < other_code_unit; + } + + return length_in_code_units() < other.length_in_code_units(); + } private: - u16 const* begin_ptr() const { return m_code_units.data(); } - u16 const* end_ptr() const { return begin_ptr() + m_code_units.size(); } + [[nodiscard]] size_t calculate_length_in_code_points() const; - size_t calculate_length_in_code_points() const; - - ReadonlySpan m_code_units; + char16_t const* m_string { nullptr }; + size_t m_length_in_code_units { 0 }; mutable size_t m_length_in_code_points { NumericLimits::max() }; }; @@ -188,6 +349,13 @@ struct Traits : public DefaultTraits { } +[[nodiscard]] ALWAYS_INLINE AK_STRING_VIEW_LITERAL_CONSTEVAL AK::Utf16View operator""sv(char16_t const* string, size_t length) +{ + AK::Utf16View view { string, length }; + ASSERT(view.validate()); + return view; +} + #if USING_AK_GLOBALLY using AK::Utf16Data; using AK::Utf16View; diff --git a/Libraries/LibCore/ProcessWindows.cpp b/Libraries/LibCore/ProcessWindows.cpp index 5fd5566dc08..d66b7989da4 100644 --- 
a/Libraries/LibCore/ProcessWindows.cpp +++ b/Libraries/LibCore/ProcessWindows.cpp @@ -111,7 +111,7 @@ ErrorOr Process::get_name() if (!length) return Error::from_windows_error(); - return String::from_utf16(Utf16View { { (u16*)path, length } }); + return String::from_utf16(Utf16View { reinterpret_cast(path), length }); } ErrorOr Process::set_name(StringView, SetThreadName) diff --git a/Libraries/LibJS/Runtime/AbstractOperations.cpp b/Libraries/LibJS/Runtime/AbstractOperations.cpp index b410d6ed7fb..80b90c65a94 100644 --- a/Libraries/LibJS/Runtime/AbstractOperations.cpp +++ b/Libraries/LibJS/Runtime/AbstractOperations.cpp @@ -1271,33 +1271,33 @@ ThrowCompletionOr get_substitution(VM& vm, Utf16View const& matched, Utf Optional capture_string; // b. If templateRemainder starts with "$$", then - if (template_remainder.starts_with(u"$$")) { + if (template_remainder.starts_with(u"$$"sv)) { // i. Let ref be "$$". - ref = u"$$"; + ref = u"$$"sv; // ii. Let refReplacement be "$". - ref_replacement = u"$"; + ref_replacement = u"$"sv; } // c. Else if templateRemainder starts with "$`", then - else if (template_remainder.starts_with(u"$`")) { + else if (template_remainder.starts_with(u"$`"sv)) { // i. Let ref be "$`". - ref = u"$`"; + ref = u"$`"sv; // ii. Let refReplacement be the substring of str from 0 to position. ref_replacement = str.substring_view(0, position); } // d. Else if templateRemainder starts with "$&", then - else if (template_remainder.starts_with(u"$&")) { + else if (template_remainder.starts_with(u"$&"sv)) { // i. Let ref be "$&". - ref = u"$&"; + ref = u"$&"sv; // ii. Let refReplacement be matched. ref_replacement = matched; } // e. Else if templateRemainder starts with "$'" (0x0024 (DOLLAR SIGN) followed by 0x0027 (APOSTROPHE)), then - else if (template_remainder.starts_with(u"$'")) { + else if (template_remainder.starts_with(u"$'"sv)) { // i. Let ref be "$'". - ref = u"$'"; + ref = u"$'"sv; // ii. Let matchLength be the length of matched. 
auto match_length = matched.length_in_code_units(); @@ -1311,7 +1311,7 @@ ThrowCompletionOr get_substitution(VM& vm, Utf16View const& matched, Utf // v. NOTE: tailPos can exceed stringLength only if this abstract operation was invoked by a call to the intrinsic @@replace method of %RegExp.prototype% on an object whose "exec" property is not the intrinsic %RegExp.prototype.exec%. } // f. Else if templateRemainder starts with "$" followed by 1 or more decimal digits, then - else if (template_remainder.starts_with(u"$") && template_remainder.length_in_code_units() > 1 && is_ascii_digit(template_remainder.code_unit_at(1))) { + else if (template_remainder.starts_with(u"$"sv) && template_remainder.length_in_code_units() > 1 && is_ascii_digit(template_remainder.code_unit_at(1))) { // i. If templateRemainder starts with "$" followed by 2 or more decimal digits, let digitCount be 2. Otherwise, let digitCount be 1. size_t digit_count = 1; @@ -1373,15 +1373,15 @@ ThrowCompletionOr get_substitution(VM& vm, Utf16View const& matched, Utf } } // g. Else if templateRemainder starts with "$<", then - else if (template_remainder.starts_with(u"$<")) { + else if (template_remainder.starts_with(u"$<"sv)) { // i. Let gtPos be StringIndexOf(templateRemainder, ">", 0). // NOTE: We can actually start at index 2 because we know the string starts with "$<". - auto greater_than_position = string_index_of(template_remainder, u">", 2); + auto greater_than_position = string_index_of(template_remainder, u">"sv, 2); // ii. If gtPos = -1 or namedCaptures is undefined, then if (!greater_than_position.has_value() || named_captures.is_undefined()) { // 1. Let ref be "$<". - ref = u"$<"; + ref = u"$<"sv; // 2. Let refReplacement be ref. ref_replacement = ref; @@ -1427,7 +1427,7 @@ ThrowCompletionOr get_substitution(VM& vm, Utf16View const& matched, Utf auto ref_length = ref.length_in_code_units(); // k. Set result to the string-concatenation of result and refReplacement. 
- result.append(ref_replacement.data(), ref_replacement.length_in_code_points()); + result.append(ref_replacement.span().data(), ref_replacement.length_in_code_units()); // j. Set templateRemainder to the substring of templateRemainder from refLength. // NOTE: We do this step last because refReplacement may point to templateRemainder. diff --git a/Libraries/LibJS/Runtime/Utf16String.cpp b/Libraries/LibJS/Runtime/Utf16String.cpp index ba94c32a82c..461590b53ea 100644 --- a/Libraries/LibJS/Runtime/Utf16String.cpp +++ b/Libraries/LibJS/Runtime/Utf16String.cpp @@ -44,7 +44,7 @@ NonnullRefPtr Utf16StringImpl::create(Utf16View const& view) { Utf16Data string; string.ensure_capacity(view.length_in_code_units()); - string.unchecked_append(view.data(), view.length_in_code_units()); + string.unchecked_append(view.span().data(), view.length_in_code_units()); auto impl = create(move(string)); if (auto length_in_code_points = view.length_in_code_points_if_known(); length_in_code_points.has_value()) diff --git a/Libraries/LibJS/Runtime/Utf16String.h b/Libraries/LibJS/Runtime/Utf16String.h index 4e9fe940ce2..3dbee3c711f 100644 --- a/Libraries/LibJS/Runtime/Utf16String.h +++ b/Libraries/LibJS/Runtime/Utf16String.h @@ -48,7 +48,7 @@ private: mutable bool m_has_hash { false }; mutable u32 m_hash { 0 }; Utf16Data m_string; - Utf16View m_cached_view { m_string.span() }; + Utf16View m_cached_view { m_string }; }; } diff --git a/Libraries/LibRegex/RegexMatch.h b/Libraries/LibRegex/RegexMatch.h index c6200f87fec..f1860ae13c3 100644 --- a/Libraries/LibRegex/RegexMatch.h +++ b/Libraries/LibRegex/RegexMatch.h @@ -147,9 +147,8 @@ public: return Vector { view }; Vector views; - u16 newline = '\n'; while (!view.is_empty()) { - auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16)); + auto position = view.find_code_unit_offset(u'\n'); if (!position.has_value()) break; - auto offset = position.value() / sizeof(u16); + auto offset = position.value(); diff --git 
a/Libraries/LibUnicode/ICU.cpp b/Libraries/LibUnicode/ICU.cpp index 0b6cf867c29..6911c6b9409 100644 --- a/Libraries/LibUnicode/ICU.cpp +++ b/Libraries/LibUnicode/ICU.cpp @@ -159,8 +159,7 @@ String icu_string_to_string(icu::UnicodeString const& string) String icu_string_to_string(UChar const* string, i32 length) { - ReadonlySpan view { reinterpret_cast(string), static_cast(length) }; - return MUST(Utf16View { view }.to_utf8()); + return MUST(Utf16View { string, static_cast(length) }.to_utf8()); } } diff --git a/Libraries/LibUnicode/Segmenter.cpp b/Libraries/LibUnicode/Segmenter.cpp index 63023995d88..25cfa5e59f5 100644 --- a/Libraries/LibUnicode/Segmenter.cpp +++ b/Libraries/LibUnicode/Segmenter.cpp @@ -75,7 +75,7 @@ public: virtual void set_segmented_text(Utf16View const& text) override { - m_segmented_text = icu::UnicodeString { text.data(), static_cast(text.length_in_code_units()) }; + m_segmented_text = icu::UnicodeString { text.span().data(), static_cast(text.length_in_code_units()) }; m_segmenter->setText(m_segmented_text.get()); } diff --git a/Libraries/LibWeb/DOM/CharacterData.cpp b/Libraries/LibWeb/DOM/CharacterData.cpp index 3ad01cbc0a8..5125ab2b009 100644 --- a/Libraries/LibWeb/DOM/CharacterData.cpp +++ b/Libraries/LibWeb/DOM/CharacterData.cpp @@ -86,11 +86,12 @@ WebIDL::ExceptionOr CharacterData::replace_data(size_t offset, size_t coun auto before_data = utf16_view.substring_view(0, offset); auto inserted_data_result = MUST(AK::utf8_to_utf16(data)); auto after_data = utf16_view.substring_view(offset + count); + Utf16Data full_data; full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units()); - full_data.append(before_data.data(), before_data.length_in_code_units()); + full_data.append(before_data.span().data(), before_data.length_in_code_units()); full_data.extend(inserted_data_result.data); - full_data.append(after_data.data(), after_data.length_in_code_units()); + 
full_data.append(after_data.span().data(), after_data.length_in_code_units()); Utf16View full_view { full_data }; bool characters_are_the_same = utf16_view == full_view; diff --git a/Libraries/LibWeb/FileAPI/FileReader.cpp b/Libraries/LibWeb/FileAPI/FileReader.cpp index 64cbee1925d..6d6ec1baac6 100644 --- a/Libraries/LibWeb/FileAPI/FileReader.cpp +++ b/Libraries/LibWeb/FileAPI/FileReader.cpp @@ -106,7 +106,7 @@ WebIDL::ExceptionOr FileReader::blob_package_data(JS::Realm& return JS::ArrayBuffer::create(realm, move(bytes)); case Type::BinaryString: // Return bytes as a binary string, in which every byte is represented by a code unit of equal value [0..255]. - Vector builder; + Utf16Data builder; builder.ensure_capacity(bytes.size()); for (auto byte : bytes.bytes()) builder.unchecked_append(byte); diff --git a/Tests/AK/TestUtf16View.cpp b/Tests/AK/TestUtf16View.cpp index 90247c876bf..ba9cfe63aae 100644 --- a/Tests/AK/TestUtf16View.cpp +++ b/Tests/AK/TestUtf16View.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, Tim Flynn + * Copyright (c) 2021-2025, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -60,8 +60,7 @@ TEST_CASE(encode_utf8) EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), utf8_string); } { - auto encoded = Array { (u16)0xd83d }; - Utf16View view { encoded }; + Utf16View view { u"\xd83d"sv }; EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv); EXPECT(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No).is_error()); } @@ -69,11 +68,8 @@ TEST_CASE(encode_utf8) TEST_CASE(decode_utf16) { - // Same string as the decode_utf8 test. 
- auto encoded = Array { (u16)0x041f, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, 0x002c, 0x0020, 0x043c, 0x0438, 0x0440, 0x0021, 0x0020, 0xd83d, 0xde00, 0x0020, 0x03b3, 0x03b5, 0x03b9, 0x03ac, 0x0020, 0x03c3, 0x03bf, 0x03c5, 0x0020, 0x03ba, 0x03cc, 0x03c3, 0x03bc, 0x03bf, 0x03c2, 0x0020, 0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c }; - - Utf16View view { encoded }; - EXPECT_EQ(encoded.size(), view.length_in_code_units()); + Utf16View view { u"Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv }; + EXPECT_EQ(view.length_in_code_units(), 39uz); size_t valid_code_units = 0; EXPECT(view.validate(valid_code_units)); @@ -113,18 +109,18 @@ TEST_CASE(null_view) TEST_CASE(utf16_literal) { { - Utf16View view { u"" }; + Utf16View view { u""sv }; EXPECT(view.validate()); EXPECT_EQ(view.length_in_code_units(), 0u); } { - Utf16View view { u"a" }; + Utf16View view { u"a"sv }; EXPECT(view.validate()); EXPECT_EQ(view.length_in_code_units(), 1u); EXPECT_EQ(view.code_unit_at(0), 0x61u); } { - Utf16View view { u"abc" }; + Utf16View view { u"abc"sv }; EXPECT(view.validate()); EXPECT_EQ(view.length_in_code_units(), 3u); EXPECT_EQ(view.code_unit_at(0), 0x61u); @@ -132,7 +128,7 @@ TEST_CASE(utf16_literal) EXPECT_EQ(view.code_unit_at(2), 0x63u); } { - Utf16View view { u"🙃" }; + Utf16View view { u"🙃"sv }; EXPECT(view.validate()); EXPECT_EQ(view.length_in_code_units(), 2u); EXPECT_EQ(view.code_unit_at(0), 0xd83du); @@ -190,14 +186,14 @@ TEST_CASE(validate_invalid_utf16) Utf16View invalid; { // Lonely high surrogate. - invalid = u"\xd800"; + invalid = u"\xd800"sv; EXPECT(!invalid.validate(valid_code_units)); EXPECT_EQ(valid_code_units, 0uz); EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); EXPECT_EQ(valid_code_units, 1uz); - invalid = u"\xdbff"; + invalid = u"\xdbff"sv; EXPECT(!invalid.validate(valid_code_units)); EXPECT_EQ(valid_code_units, 0uz); @@ -206,14 +202,14 @@ TEST_CASE(validate_invalid_utf16) } { // Lonely low surrogate. 
- invalid = u"\xdc00"; + invalid = u"\xdc00"sv; EXPECT(!invalid.validate(valid_code_units)); EXPECT_EQ(valid_code_units, 0uz); EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); EXPECT_EQ(valid_code_units, 1uz); - invalid = u"\xdfff"; + invalid = u"\xdfff"sv; EXPECT(!invalid.validate(valid_code_units)); EXPECT_EQ(valid_code_units, 0uz); @@ -222,14 +218,14 @@ TEST_CASE(validate_invalid_utf16) } { // High surrogate followed by non-surrogate. - invalid = u"\xd800\x0000"; + invalid = u"\xd800\x0000"sv; EXPECT(!invalid.validate(valid_code_units)); EXPECT_EQ(valid_code_units, 0uz); EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); EXPECT_EQ(valid_code_units, 2uz); - invalid = u"\xd800\xe000"; + invalid = u"\xd800\xe000"sv; EXPECT(!invalid.validate(valid_code_units)); EXPECT_EQ(valid_code_units, 0uz); @@ -238,14 +234,14 @@ TEST_CASE(validate_invalid_utf16) } { // High surrogate followed by high surrogate. - invalid = u"\xd800\xd800"; + invalid = u"\xd800\xd800"sv; EXPECT(!invalid.validate(valid_code_units)); EXPECT_EQ(valid_code_units, 0uz); EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); EXPECT_EQ(valid_code_units, 2uz); - invalid = u"\xd800\xdbff"; + invalid = u"\xd800\xdbff"sv; EXPECT(!invalid.validate(valid_code_units)); EXPECT_EQ(valid_code_units, 0uz); @@ -254,14 +250,14 @@ TEST_CASE(validate_invalid_utf16) } { // Valid UTF-16 followed by invalid code units. - invalid = u"\x0041\x0041\xd800"; + invalid = u"\x0041\x0041\xd800"sv; EXPECT(!invalid.validate(valid_code_units)); EXPECT_EQ(valid_code_units, 2uz); EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); EXPECT_EQ(valid_code_units, 3uz); - invalid = u"\x0041\x0041\xd800"; + invalid = u"\x0041\x0041\xd800"sv; EXPECT(!invalid.validate(valid_code_units)); EXPECT_EQ(valid_code_units, 2uz); @@ -274,10 +270,8 @@ TEST_CASE(decode_invalid_utf16) { { // Lonely high surrogate. 
- auto invalid = Array { (u16)0x41, 0x42, 0xd800 }; - - Utf16View view { invalid }; - EXPECT_EQ(invalid.size(), view.length_in_code_units()); + Utf16View view { u"AB\xd800"sv }; + EXPECT_EQ(view.length_in_code_units(), 3uz); auto expected = Array { (u32)0x41, 0x42, 0xfffd }; EXPECT_EQ(expected.size(), view.length_in_code_points()); @@ -290,10 +284,8 @@ TEST_CASE(decode_invalid_utf16) } { // Lonely low surrogate. - auto invalid = Array { (u16)0x41, 0x42, 0xdc00 }; - - Utf16View view { invalid }; - EXPECT_EQ(invalid.size(), view.length_in_code_units()); + Utf16View view { u"AB\xdc00"sv }; + EXPECT_EQ(view.length_in_code_units(), 3uz); auto expected = Array { (u32)0x41, 0x42, 0xfffd }; EXPECT_EQ(expected.size(), view.length_in_code_points()); @@ -306,10 +298,8 @@ TEST_CASE(decode_invalid_utf16) } { // High surrogate followed by non-surrogate. - auto invalid = Array { (u16)0x41, 0x42, 0xd800, 0 }; - - Utf16View view { invalid }; - EXPECT_EQ(invalid.size(), view.length_in_code_units()); + Utf16View view { u"AB\xd800\x0000"sv }; + EXPECT_EQ(view.length_in_code_units(), 4uz); auto expected = Array { (u32)0x41, 0x42, 0xfffd, 0 }; EXPECT_EQ(expected.size(), view.length_in_code_points()); @@ -322,10 +312,8 @@ TEST_CASE(decode_invalid_utf16) } { // High surrogate followed by high surrogate. 
- auto invalid = Array { (u16)0x41, 0x42, 0xd800, 0xd800 }; - - Utf16View view { invalid }; - EXPECT_EQ(invalid.size(), view.length_in_code_units()); + Utf16View view { u"AB\xd800\xd800"sv }; + EXPECT_EQ(view.length_in_code_units(), 4uz); auto expected = Array { (u32)0x41, 0x42, 0xfffd, 0xfffd }; EXPECT_EQ(expected.size(), view.length_in_code_points()); @@ -341,13 +329,13 @@ TEST_CASE(decode_invalid_utf16) TEST_CASE(is_ascii) { EXPECT(Utf16View {}.is_ascii()); - EXPECT(Utf16View { u"a" }.is_ascii()); - EXPECT(Utf16View { u"foo" }.is_ascii()); - EXPECT(Utf16View { u"foo\t\n\rbar\v\b123" }.is_ascii()); + EXPECT(u"a"sv.is_ascii()); + EXPECT(u"foo"sv.is_ascii()); + EXPECT(u"foo\t\n\rbar\v\b123"sv.is_ascii()); - EXPECT(!Utf16View { u"😀" }.is_ascii()); - EXPECT(!Utf16View { u"foo 😀" }.is_ascii()); - EXPECT(!Utf16View { u"😀 foo" }.is_ascii()); + EXPECT(!u"😀"sv.is_ascii()); + EXPECT(!u"foo 😀"sv.is_ascii()); + EXPECT(!u"😀 foo"sv.is_ascii()); } TEST_CASE(equals_ignoring_case) @@ -387,28 +375,28 @@ TEST_CASE(substring_view) TEST_CASE(starts_with) { - EXPECT(Utf16View {}.starts_with(u"")); - EXPECT(!Utf16View {}.starts_with(u" ")); + EXPECT(Utf16View {}.starts_with(u""sv)); + EXPECT(!Utf16View {}.starts_with(u" "sv)); - EXPECT(Utf16View { u"a" }.starts_with(u"")); - EXPECT(Utf16View { u"a" }.starts_with(u"a")); - EXPECT(!Utf16View { u"a" }.starts_with(u"b")); - EXPECT(!Utf16View { u"a" }.starts_with(u"ab")); + EXPECT(u"a"sv.starts_with(u""sv)); + EXPECT(u"a"sv.starts_with(u"a"sv)); + EXPECT(!u"a"sv.starts_with(u"b"sv)); + EXPECT(!u"a"sv.starts_with(u"ab"sv)); - EXPECT(Utf16View { u"abc" }.starts_with(u"")); - EXPECT(Utf16View { u"abc" }.starts_with(u"a")); - EXPECT(Utf16View { u"abc" }.starts_with(u"ab")); - EXPECT(Utf16View { u"abc" }.starts_with(u"abc")); - EXPECT(!Utf16View { u"abc" }.starts_with(u"b")); - EXPECT(!Utf16View { u"abc" }.starts_with(u"bc")); + EXPECT(u"abc"sv.starts_with(u""sv)); + EXPECT(u"abc"sv.starts_with(u"a"sv)); + EXPECT(u"abc"sv.starts_with(u"ab"sv)); 
+ EXPECT(u"abc"sv.starts_with(u"abc"sv)); + EXPECT(!u"abc"sv.starts_with(u"b"sv)); + EXPECT(!u"abc"sv.starts_with(u"bc"sv)); - auto emoji = Utf16View { u"😀🙃" }; + auto emoji = u"😀🙃"sv; - EXPECT(emoji.starts_with(u"")); - EXPECT(emoji.starts_with(u"😀")); - EXPECT(emoji.starts_with(u"😀🙃")); - EXPECT(!emoji.starts_with(u"a")); - EXPECT(!emoji.starts_with(u"🙃")); + EXPECT(emoji.starts_with(u""sv)); + EXPECT(emoji.starts_with(u"😀"sv)); + EXPECT(emoji.starts_with(u"😀🙃"sv)); + EXPECT(!emoji.starts_with(u"a"sv)); + EXPECT(!emoji.starts_with(u"🙃"sv)); } TEST_CASE(find_code_unit_offset) @@ -416,16 +404,16 @@ TEST_CASE(find_code_unit_offset) auto conversion_result = MUST(AK::utf8_to_utf16("😀foo😀bar"sv)); Utf16View const view { conversion_result }; - EXPECT_EQ(0u, view.find_code_unit_offset(u"").value()); - EXPECT_EQ(4u, view.find_code_unit_offset(u"", 4).value()); - EXPECT(!view.find_code_unit_offset(u"", 16).has_value()); + EXPECT_EQ(0u, view.find_code_unit_offset(u""sv).value()); + EXPECT_EQ(4u, view.find_code_unit_offset(u""sv, 4).value()); + EXPECT(!view.find_code_unit_offset(u""sv, 16).has_value()); - EXPECT_EQ(0u, view.find_code_unit_offset(u"😀").value()); - EXPECT_EQ(5u, view.find_code_unit_offset(u"😀", 1).value()); - EXPECT_EQ(2u, view.find_code_unit_offset(u"foo").value()); - EXPECT_EQ(7u, view.find_code_unit_offset(u"bar").value()); + EXPECT_EQ(0u, view.find_code_unit_offset(u"😀"sv).value()); + EXPECT_EQ(5u, view.find_code_unit_offset(u"😀"sv, 1).value()); + EXPECT_EQ(2u, view.find_code_unit_offset(u"foo"sv).value()); + EXPECT_EQ(7u, view.find_code_unit_offset(u"bar"sv).value()); - EXPECT(!view.find_code_unit_offset(u"baz").has_value()); + EXPECT(!view.find_code_unit_offset(u"baz"sv).has_value()); } TEST_CASE(find_code_unit_offset_ignoring_case) @@ -433,13 +421,13 @@ TEST_CASE(find_code_unit_offset_ignoring_case) auto conversion_result = MUST(AK::utf8_to_utf16("😀Foo😀Bar"sv)); Utf16View const view { conversion_result }; - EXPECT_EQ(0u, 
view.find_code_unit_offset_ignoring_case(u"").value()); - EXPECT_EQ(4u, view.find_code_unit_offset_ignoring_case(u"", 4).value()); - EXPECT(!view.find_code_unit_offset_ignoring_case(u"", 16).has_value()); + EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u""sv).value()); + EXPECT_EQ(4u, view.find_code_unit_offset_ignoring_case(u""sv, 4).value()); + EXPECT(!view.find_code_unit_offset_ignoring_case(u""sv, 16).has_value()); - EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u"😀").value()); - EXPECT_EQ(5u, view.find_code_unit_offset_ignoring_case(u"😀", 1).value()); - EXPECT_EQ(2u, view.find_code_unit_offset_ignoring_case(u"foO").value()); - EXPECT_EQ(7u, view.find_code_unit_offset_ignoring_case(u"baR").value()); - EXPECT(!view.find_code_unit_offset_ignoring_case(u"baz").has_value()); + EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u"😀"sv).value()); + EXPECT_EQ(5u, view.find_code_unit_offset_ignoring_case(u"😀"sv, 1).value()); + EXPECT_EQ(2u, view.find_code_unit_offset_ignoring_case(u"foO"sv).value()); + EXPECT_EQ(7u, view.find_code_unit_offset_ignoring_case(u"baR"sv).value()); + EXPECT(!view.find_code_unit_offset_ignoring_case(u"baz"sv).has_value()); }