diff --git a/AK/GenericLexer.cpp b/AK/GenericLexer.cpp index 47c4edefb3d..4ecef4de954 100644 --- a/AK/GenericLexer.cpp +++ b/AK/GenericLexer.cpp @@ -10,7 +10,7 @@ #include #include #include -#include +#include namespace AK { @@ -266,7 +266,7 @@ auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pair auto high_surrogate = decode_one_surrogate(); if (!high_surrogate.has_value()) return UnicodeEscapeError::MalformedUnicodeEscape; - if (!Utf16View::is_high_surrogate(*high_surrogate)) + if (!UnicodeUtils::is_utf16_high_surrogate(*high_surrogate)) return *high_surrogate; if (!combine_surrogate_pairs || !consume_specific("\\u"sv)) return *high_surrogate; @@ -274,8 +274,8 @@ auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pair auto low_surrogate = decode_one_surrogate(); if (!low_surrogate.has_value()) return UnicodeEscapeError::MalformedUnicodeEscape; - if (Utf16View::is_low_surrogate(*low_surrogate)) - return Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate); + if (UnicodeUtils::is_utf16_low_surrogate(*low_surrogate)) + return UnicodeUtils::decode_utf16_surrogate_pair(*high_surrogate, *low_surrogate); retreat(6); return *high_surrogate; diff --git a/AK/StringBuilder.cpp b/AK/StringBuilder.cpp index aabcd6fc8bb..55ff609ef12 100644 --- a/AK/StringBuilder.cpp +++ b/AK/StringBuilder.cpp @@ -283,7 +283,7 @@ ErrorOr StringBuilder::try_append(Utf16View const& utf16_view) uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 12) & 0x0f) | 0xe0); uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 6) & 0x3f) | 0x80); uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 0) & 0x3f) | 0x80); - } while (first_invalid_code_unit < remaining_view.length_in_code_units() && Utf16View::is_low_surrogate(remaining_view.data()[first_invalid_code_unit])); + } while (first_invalid_code_unit < remaining_view.length_in_code_units() && UnicodeUtils::is_utf16_low_surrogate(remaining_view.data()[first_invalid_code_unit])); // Code unit might no longer be invalid, retry on the remaining data. m_buffer.set_size(m_buffer.size() + bytes_just_written); diff --git a/AK/UnicodeUtils.h b/AK/UnicodeUtils.h index 7c57a262cd4..a9518b55314 100644 --- a/AK/UnicodeUtils.h +++ b/AK/UnicodeUtils.h @@ -78,6 +78,82 @@ template Callback> return -1; } +constexpr inline u16 HIGH_SURROGATE_MIN = 0xd800; +constexpr inline u16 HIGH_SURROGATE_MAX = 0xdbff; +constexpr inline u16 LOW_SURROGATE_MIN = 0xdc00; +constexpr inline u16 LOW_SURROGATE_MAX = 0xdfff; +constexpr inline u32 REPLACEMENT_CODE_POINT = 0xfffd; +constexpr inline u32 FIRST_SUPPLEMENTARY_PLANE_CODE_POINT = 0x10000; + +[[nodiscard]] constexpr size_t code_unit_length_for_code_point(u32 code_point) +{ + return code_point < FIRST_SUPPLEMENTARY_PLANE_CODE_POINT ? 1uz : 2uz; +} + +[[nodiscard]] constexpr bool is_utf16_high_surrogate(u16 code_unit) +{ + return (code_unit >= HIGH_SURROGATE_MIN) && (code_unit <= HIGH_SURROGATE_MAX); +} + +[[nodiscard]] constexpr bool is_utf16_low_surrogate(u16 code_unit) +{ + return (code_unit >= LOW_SURROGATE_MIN) && (code_unit <= LOW_SURROGATE_MAX); +} + +[[nodiscard]] constexpr u32 decode_utf16_surrogate_pair(u16 high_surrogate, u16 low_surrogate) +{ + VERIFY(is_utf16_high_surrogate(high_surrogate)); + VERIFY(is_utf16_low_surrogate(low_surrogate)); + + return ((high_surrogate - HIGH_SURROGATE_MIN) << 10) + (low_surrogate - LOW_SURROGATE_MIN) + FIRST_SUPPLEMENTARY_PLANE_CODE_POINT; +} + +template +[[nodiscard]] constexpr size_t code_point_to_utf16(u32 code_point, Callback callback) +{ + if (code_point < FIRST_SUPPLEMENTARY_PLANE_CODE_POINT) { + callback(static_cast(code_point)); + return 1uz; + } + + if (code_point <= 0x10ffff) { + code_point -= FIRST_SUPPLEMENTARY_PLANE_CODE_POINT; + + auto code_unit = static_cast(HIGH_SURROGATE_MIN | (code_point >> 10)); + callback(static_cast(code_unit)); + + code_unit = static_cast(LOW_SURROGATE_MIN | (code_point & 0x3ff)); + callback(static_cast(code_unit)); + + return 2uz; + } + + VERIFY_NOT_REACHED(); +} + +template Callback> +constexpr ErrorOr try_code_point_to_utf16(u32 code_point, Callback callback) +{ + if (code_point < FIRST_SUPPLEMENTARY_PLANE_CODE_POINT) { + TRY(callback(static_cast(code_point))); + return 1uz; + } + + if (code_point <= 0x10ffff) { + code_point -= FIRST_SUPPLEMENTARY_PLANE_CODE_POINT; + + auto code_unit = static_cast(HIGH_SURROGATE_MIN | (code_point >> 10)); + TRY(callback(static_cast(code_unit))); + + code_unit = static_cast(LOW_SURROGATE_MIN | (code_point & 0x3ff)); + TRY(callback(static_cast(code_unit))); + + return 2uz; + } + + VERIFY_NOT_REACHED(); +} + /** * Compute the maximum number of UTF-8 bytes needed to store a given UTF-16 string, accounting for unmatched UTF-16 surrogates. * This function will overcount by at most 33%; 2 bytes for every valid UTF-16 codepoint between U+100000 and U+10FFFF. diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp index e2008cee3dd..1e1b59f8d15 100644 --- a/AK/Utf16View.cpp +++ b/AK/Utf16View.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, Tim Flynn + * Copyright (c) 2021-2025, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -16,13 +16,6 @@ namespace AK { -static constexpr u16 high_surrogate_min = 0xd800; -static constexpr u16 high_surrogate_max = 0xdbff; -static constexpr u16 low_surrogate_min = 0xdc00; -static constexpr u16 low_surrogate_max = 0xdfff; -static constexpr u32 replacement_code_point = 0xfffd; -static constexpr u32 first_supplementary_plane_code_point = 0x10000; - static constexpr u16 host_code_unit(u16 code_unit, Endianness endianness) { switch (endianness) { @@ -44,7 +37,11 @@ static ErrorOr to_utf16_slow(UtfViewType const& view, End size_t code_point_count = 0; for (auto code_point : view) { - TRY(code_point_to_utf16(utf16_data, code_point, endianness)); + TRY(UnicodeUtils::try_code_point_to_utf16(code_point, [&](auto code_unit) -> ErrorOr { + TRY(utf16_data.try_append(host_code_unit(code_unit, endianness))); + return {}; + })); + code_point_count++; } @@ -116,48 +113,11 @@ ErrorOr utf32_to_utf16(Utf32View const& utf32_view, Endia return Utf16ConversionResult { utf16_data, length }; } -ErrorOr code_point_to_utf16(Utf16Data& string, u32 code_point, Endianness endianness) -{ - VERIFY(is_unicode(code_point)); - - if (code_point < first_supplementary_plane_code_point) { - TRY(string.try_append(host_code_unit(static_cast(code_point), endianness))); - } else { - code_point -= first_supplementary_plane_code_point; - - auto code_unit = static_cast(high_surrogate_min | (code_point >> 10)); - TRY(string.try_append(host_code_unit(code_unit, endianness))); - - code_unit = static_cast(low_surrogate_min | (code_point & 0x3ff)); - TRY(string.try_append(host_code_unit(code_unit, endianness))); - } - - return {}; -} - size_t utf16_code_unit_length_from_utf8(StringView string) { return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length()); } -bool Utf16View::is_high_surrogate(u16 code_unit) -{ - return (code_unit >= high_surrogate_min) && (code_unit <= high_surrogate_max); -} - -bool Utf16View::is_low_surrogate(u16 code_unit) -{ - return (code_unit >= low_surrogate_min) && (code_unit <= low_surrogate_max); -} - -u32 Utf16View::decode_surrogate_pair(u16 high_surrogate, u16 low_surrogate) -{ - VERIFY(is_high_surrogate(high_surrogate)); - VERIFY(is_low_surrogate(low_surrogate)); - - return ((high_surrogate - high_surrogate_min) << 10) + (low_surrogate - low_surrogate_min) + first_supplementary_plane_code_point; -} - ErrorOr Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invalid_code_units) const { return TRY(to_utf8(allow_invalid_code_units)).to_byte_string(); @@ -191,16 +151,16 @@ u32 Utf16View::code_point_at(size_t index) const VERIFY(index < length_in_code_units()); u32 code_point = code_unit_at(index); - if (!is_high_surrogate(code_point) && !is_low_surrogate(code_point)) + if (!UnicodeUtils::is_utf16_high_surrogate(code_point) && !UnicodeUtils::is_utf16_low_surrogate(code_point)) return code_point; - if (is_low_surrogate(code_point) || (index + 1 == length_in_code_units())) + if (UnicodeUtils::is_utf16_low_surrogate(code_point) || (index + 1 == length_in_code_units())) return code_point; auto second = code_unit_at(index + 1); - if (!is_low_surrogate(second)) + if (!UnicodeUtils::is_utf16_low_surrogate(second)) return code_point; - return decode_surrogate_pair(code_point, second); + return UnicodeUtils::decode_utf16_surrogate_pair(code_point, second); } size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const @@ -418,28 +378,23 @@ u32 Utf16CodePointIterator::operator*() const auto code_unit = host_code_unit(*m_ptr, Endianness::Host); - if (Utf16View::is_high_surrogate(code_unit)) { + if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) { if (m_remaining_code_units > 1) { auto next_code_unit = host_code_unit(*(m_ptr + 1), Endianness::Host); - if (Utf16View::is_low_surrogate(next_code_unit)) - return Utf16View::decode_surrogate_pair(code_unit, next_code_unit); + if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit)) + return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit); } - return replacement_code_point; + return UnicodeUtils::REPLACEMENT_CODE_POINT; } - if (Utf16View::is_low_surrogate(code_unit)) - return replacement_code_point; + if (UnicodeUtils::is_utf16_low_surrogate(code_unit)) + return UnicodeUtils::REPLACEMENT_CODE_POINT; return static_cast(code_unit); } -size_t Utf16CodePointIterator::length_in_code_units() const -{ - return *(*this) < first_supplementary_plane_code_point ? 1 : 2; -} - bool validate_utf16_le(ReadonlyBytes bytes) { return simdutf::validate_utf16le(reinterpret_cast(bytes.data()), bytes.size() / 2); diff --git a/AK/Utf16View.h b/AK/Utf16View.h index aaba8b20aef..d38aff5788e 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, Tim Flynn + * Copyright (c) 2021-2025, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -15,6 +15,7 @@ #include #include #include +#include #include namespace AK { @@ -28,7 +29,6 @@ struct Utf16ConversionResult { ErrorOr utf8_to_utf16(StringView, Endianness = Endianness::Host); ErrorOr utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host); ErrorOr utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host); -ErrorOr code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host); [[nodiscard]] bool validate_utf16_le(ReadonlyBytes); [[nodiscard]] bool validate_utf16_be(ReadonlyBytes); @@ -52,7 +52,10 @@ public: Utf16CodePointIterator& operator++(); u32 operator*() const; - size_t length_in_code_units() const; + size_t length_in_code_units() const + { + return UnicodeUtils::code_unit_length_for_code_point(**this); + } private: Utf16CodePointIterator(u16 const* ptr, size_t length) @@ -69,10 +72,6 @@ class Utf16View { public: using Iterator = Utf16CodePointIterator; - static bool is_high_surrogate(u16); - static bool is_low_surrogate(u16); - static u32 decode_surrogate_pair(u16 high_surrogate, u16 low_surrogate); - Utf16View() = default; ~Utf16View() = default; diff --git a/Libraries/LibJS/Parser.cpp b/Libraries/LibJS/Parser.cpp index 80cde29b8e4..3f5226df21e 100644 --- a/Libraries/LibJS/Parser.cpp +++ b/Libraries/LibJS/Parser.cpp @@ -9,13 +9,14 @@ * SPDX-License-Identifier: BSD-2-Clause */ -#include "Parser.h" #include #include #include #include #include #include +#include +#include #include #include @@ -4601,7 +4602,7 @@ FlyString Parser::consume_string_value() Utf8View view { value.bytes_as_string_view().substring_view(value.bytes().size() - 3) }; VERIFY(view.length() <= 3); auto codepoint = *view.begin(); - if (Utf16View::is_high_surrogate(codepoint)) { + if (AK::UnicodeUtils::is_utf16_high_surrogate(codepoint)) { syntax_error("StringValue ending with unpaired high surrogate"_string); VERIFY(view.length() == 1); } diff --git a/Libraries/LibJS/Runtime/PrimitiveString.cpp b/Libraries/LibJS/Runtime/PrimitiveString.cpp index 3a156123ed4..dad65f03838 100644 --- a/Libraries/LibJS/Runtime/PrimitiveString.cpp +++ b/Libraries/LibJS/Runtime/PrimitiveString.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -308,7 +309,7 @@ void RopeString::resolve(EncodingPreference preference) const auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin(); auto low_surrogate = *Utf8View(current_string_as_utf8).begin(); - if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) { + if (!AK::UnicodeUtils::is_utf16_high_surrogate(high_surrogate) || !AK::UnicodeUtils::is_utf16_low_surrogate(low_surrogate)) { builder.append(current_string_as_utf8); previous = current; continue; @@ -316,7 +317,7 @@ void RopeString::resolve(EncodingPreference preference) const // Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point. builder.trim(3); - builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate)); + builder.append_code_point(AK::UnicodeUtils::decode_utf16_surrogate_pair(high_surrogate, low_surrogate)); // Append the remaining part of the current string. builder.append(current_string_as_utf8.substring_view(3)); diff --git a/Libraries/LibJS/Runtime/StringConstructor.cpp b/Libraries/LibJS/Runtime/StringConstructor.cpp index 1da468c6371..e34ce300c6f 100644 --- a/Libraries/LibJS/Runtime/StringConstructor.cpp +++ b/Libraries/LibJS/Runtime/StringConstructor.cpp @@ -5,8 +5,8 @@ */ #include +#include #include -#include #include #include #include @@ -129,7 +129,9 @@ JS_DEFINE_NATIVE_FUNCTION(StringConstructor::from_code_point) return vm.throw_completion(ErrorType::InvalidCodePoint, next_code_point.to_string_without_side_effects()); // d. Set result to the string-concatenation of result and UTF16EncodeCodePoint(ℝ(nextCP)). - MUST(code_point_to_utf16(string, static_cast(code_point))); + (void)AK::UnicodeUtils::code_point_to_utf16(static_cast(code_point), [&](auto code_unit) { + string.append(code_unit); + }); } // 3. Assert: If codePoints is empty, then result is the empty String. diff --git a/Libraries/LibJS/Runtime/StringPrototype.cpp b/Libraries/LibJS/Runtime/StringPrototype.cpp index 9e5efbdb2dd..f027b26cdd8 100644 --- a/Libraries/LibJS/Runtime/StringPrototype.cpp +++ b/Libraries/LibJS/Runtime/StringPrototype.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -121,7 +122,7 @@ CodePoint code_point_at(Utf16View const& string, size_t position) } // 6. If first is a trailing surrogate or position + 1 = size, then - if (Utf16View::is_low_surrogate(first) || (position + 1 == string.length_in_code_units())) { + if (AK::UnicodeUtils::is_utf16_low_surrogate(first) || (position + 1 == string.length_in_code_units())) { // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }. return { true, code_point, 1 }; } @@ -130,13 +131,13 @@ CodePoint code_point_at(Utf16View const& string, size_t position) auto second = string.code_unit_at(position + 1); // 8. If second is not a trailing surrogate, then - if (!Utf16View::is_low_surrogate(second)) { + if (!AK::UnicodeUtils::is_utf16_low_surrogate(second)) { // a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }. return { true, code_point, 1 }; } // 9. Set cp to UTF16SurrogatePairToCodePoint(first, second). - code_point = Utf16View::decode_surrogate_pair(first, second); + code_point = AK::UnicodeUtils::decode_utf16_surrogate_pair(first, second); // 10. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 2, [[IsUnpairedSurrogate]]: false }. return { false, code_point, 2 }; diff --git a/Libraries/LibWeb/Encoding/TextEncoderStream.cpp b/Libraries/LibWeb/Encoding/TextEncoderStream.cpp index 5aa87b37a64..78b8caaf822 100644 --- a/Libraries/LibWeb/Encoding/TextEncoderStream.cpp +++ b/Libraries/LibWeb/Encoding/TextEncoderStream.cpp @@ -184,11 +184,11 @@ Optional TextEncoderStream::convert_code_unit_to_scalar_value(u32 item, Utf // 3. If item is a trailing surrogate, then return a scalar value from surrogates given leadingSurrogate // and item. - if (Utf16View::is_low_surrogate(item)) { + if (AK::UnicodeUtils::is_utf16_low_surrogate(item)) { // https://encoding.spec.whatwg.org/#scalar-value-from-surrogates // To obtain a scalar value from surrogates, given a leading surrogate leading and a trailing surrogate // trailing, return 0x10000 + ((leading − 0xD800) << 10) + (trailing − 0xDC00). - return Utf16View::decode_surrogate_pair(leading_surrogate, item); + return AK::UnicodeUtils::decode_utf16_surrogate_pair(leading_surrogate, item); } // 4. Restore item to input. @@ -199,13 +199,13 @@ Optional TextEncoderStream::convert_code_unit_to_scalar_value(u32 item, Utf } // 2. If item is a leading surrogate, then set encoder’s leading surrogate to item and return continue. - if (Utf16View::is_high_surrogate(item)) { + if (AK::UnicodeUtils::is_utf16_high_surrogate(item)) { m_leading_surrogate = item; return OptionalNone {}; } // 3. If item is a trailing surrogate, then return U+FFFD. - if (Utf16View::is_low_surrogate(item)) + if (AK::UnicodeUtils::is_utf16_low_surrogate(item)) return 0xFFFD; // 4. Return item.