AK+Everywhere: Prepare Utf16View for integration with a UTF-16 string

To prepare for an upcoming Utf16String, this migrates Utf16View to store its data as a char16_t. Most function definitions are moved inline and made constexpr. This also adds a UDL to construct a Utf16View from a string literal: auto string = u"hello"sv; This let's us remove the NTTP Utf16View constructor, as we have found that such constructors bloat binary size quite a bit.
Author: https://github.com/trflynn89 Commit: 86b1c78c1a Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5228 Reviewed-by: https://github.com/ADKaster ✅ Reviewed-by: https://github.com/shannonbooth
2025-08-09 09:39:39 +00:00 · 2025-06-26 12:52:23 -04:00 · 2025-06-26 12:52:23 -04:00 · 86b1c78c1a · 2025-07-03 13:53:23 +00:00
commit 86b1c78c1a
parent c17b067e1d
17 changed files with 406 additions and 421 deletions
--- a/AK/Utf16View.cpp
+++ b/AK/Utf16View.cpp
@ -80,77 +80,75 @@ ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const& utf32_view)
    return Utf16ConversionResult { utf16_data, length };
 }

+bool validate_utf16_le(ReadonlyBytes bytes)
+{
+    return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
+}
+
+bool validate_utf16_be(ReadonlyBytes bytes)
+{
+    return simdutf::validate_utf16be(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
+}
+
 size_t utf16_code_unit_length_from_utf8(StringView string)
 {
    return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length());
 }

+ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
+{
+    if (is_empty())
+        return String {};
+    if (!validate(allow_invalid_code_units))
+        return Error::from_string_literal("Input was not valid UTF-16");
+
+    if (allow_invalid_code_units == AllowInvalidCodeUnits::No) {
+        String result;
+        auto utf8_length = simdutf::utf8_length_from_utf16(m_string, length_in_code_units());
+
+        TRY(result.replace_with_new_string(Badge<Utf16View> {}, utf8_length, [&](Bytes buffer) -> ErrorOr<void> {
+            [[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(m_string, length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
+            ASSERT(result == buffer.size());
+            return {};
+        }));
+
+        return result;
+    }
+
+    StringBuilder builder;
+    builder.append(*this);
+    return builder.to_string();
+}
+
 ErrorOr<ByteString> Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invalid_code_units) const
 {
    return TRY(to_utf8(allow_invalid_code_units)).to_byte_string();
 }

-ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
-{
-    if (allow_invalid_code_units == AllowInvalidCodeUnits::No)
-        return String::from_utf16(*this);
-
-    StringBuilder builder;
-    builder.append(*this);
-    return builder.to_string();
-}
-
 bool Utf16View::is_ascii() const
 {
-    return simdutf::validate_ascii(reinterpret_cast<char const*>(m_code_units.data()), length_in_code_units() * sizeof(char16_t));
+    return simdutf::validate_ascii(reinterpret_cast<char const*>(m_string), length_in_code_units() * sizeof(char16_t));
 }

-size_t Utf16View::length_in_code_points() const
+bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const
 {
-    if (m_length_in_code_points == NumericLimits<size_t>::max())
-        m_length_in_code_points = calculate_length_in_code_points();
-    return m_length_in_code_points;
-}
+    auto view = *this;
+    valid_code_units = 0;

-u16 Utf16View::code_unit_at(size_t index) const
-{
-    VERIFY(index < length_in_code_units());
-    return m_code_units[index];
-}
+    while (!view.is_empty()) {
+        auto result = simdutf::validate_utf16_with_errors(view.m_string, view.length_in_code_units());
+        valid_code_units += result.count;

-u32 Utf16View::code_point_at(size_t index) const
-{
-    VERIFY(index < length_in_code_units());
+        if (result.error == simdutf::SUCCESS)
+            return true;
+        if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE)
+            return false;

-    u32 code_point = code_unit_at(index);
-    if (!UnicodeUtils::is_utf16_high_surrogate(code_point) && !UnicodeUtils::is_utf16_low_surrogate(code_point))
-        return code_point;
-    if (UnicodeUtils::is_utf16_low_surrogate(code_point) || (index + 1 == length_in_code_units()))
-        return code_point;
-
-    auto second = code_unit_at(index + 1);
-    if (!UnicodeUtils::is_utf16_low_surrogate(second))
-        return code_point;
-
-    return UnicodeUtils::decode_utf16_surrogate_pair(code_point, second);
-}
-
-size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
-{
-    if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
-        return code_unit_offset;
-
-    size_t code_point_offset = 0;
-
-    for (auto it = begin(); it != end(); ++it) {
-        if (code_unit_offset == 0)
-            return code_point_offset;
-
-        code_unit_offset -= it.length_in_code_units();
-        ++code_point_offset;
+        view = view.substring_view(result.count + 1);
+        ++valid_code_units;
    }

-    return code_point_offset;
+    return true;
 }

 size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
@ -171,19 +169,22 @@ size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
    return code_unit_offset;
 }

-size_t Utf16View::code_unit_offset_of(Utf16CodePointIterator const& it) const
+size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
 {
-    VERIFY(it.m_ptr >= begin_ptr());
-    VERIFY(it.m_ptr <= end_ptr());
+    if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
+        return code_unit_offset;

-    return it.m_ptr - begin_ptr();
-}
+    size_t code_point_offset = 0;

-Utf16View Utf16View::substring_view(size_t code_unit_offset, size_t code_unit_length) const
-{
-    VERIFY(!Checked<size_t>::addition_would_overflow(code_unit_offset, code_unit_length));
+    for (auto it = begin(); it != end(); ++it) {
+        if (code_unit_offset == 0)
+            return code_point_offset;

-    return Utf16View { m_code_units.slice(code_unit_offset, code_unit_length) };
+        code_unit_offset -= it.length_in_code_units();
+        ++code_point_offset;
+    }
+
+    return code_point_offset;
 }

 Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const
@ -194,7 +195,10 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
    if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
        return substring_view(code_point_offset, code_point_length);

-    auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { return it.m_ptr - begin_ptr(); };
+    auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) {
+        return it.m_iterator - m_string;
+    };
+
    size_t code_point_index = 0;
    size_t code_unit_offset = 0;

@ -213,101 +217,13 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
    VERIFY_NOT_REACHED();
 }

-Optional<size_t> Utf16View::find_code_unit_offset(Utf16View const& needle, size_t start_offset) const
-{
-    return m_code_units.index_of(needle.m_code_units, start_offset);
-}
-
-Optional<size_t> Utf16View::find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset) const
-{
-    Checked maximum_offset { start_offset };
-    maximum_offset += needle.length_in_code_units();
-    if (maximum_offset.has_overflow() || maximum_offset.value() > length_in_code_units())
-        return {};
-
-    if (needle.is_empty())
-        return start_offset;
-
-    size_t index = start_offset;
-    while (index <= length_in_code_units() - needle.length_in_code_units()) {
-        Utf16View const slice { m_code_units.slice(index, needle.length_in_code_units()) };
-        if (slice.equals_ignoring_case(needle))
-            return index;
-        index += slice.begin().length_in_code_units();
-    }
-
-    return {};
-}
-
-bool Utf16View::starts_with(Utf16View const& needle) const
-{
-    if (needle.is_empty())
-        return true;
-    if (is_empty())
-        return false;
-    if (needle.length_in_code_units() > length_in_code_units())
-        return false;
-    if (begin_ptr() == needle.begin_ptr())
-        return true;
-
-    for (auto this_it = begin(), needle_it = needle.begin(); needle_it != needle.end(); ++needle_it, ++this_it) {
-        if (*this_it != *needle_it)
-            return false;
-    }
-
-    return true;
-}
-
-// https://infra.spec.whatwg.org/#code-unit-less-than
-bool Utf16View::is_code_unit_less_than(Utf16View const& other) const
-{
-    auto a = m_code_units;
-    auto b = other.m_code_units;
-
-    auto common_length = min(a.size(), b.size());
-
-    for (size_t position = 0; position < common_length; ++position) {
-        if (a[position] != b[position])
-            return a[position] < b[position];
-    }
-
-    return a.size() < b.size();
-}
-
-bool Utf16View::validate(AllowInvalidCodeUnits allow_invalid_code_units) const
-{
-    size_t valid_code_units = 0;
-    return validate(valid_code_units, allow_invalid_code_units);
-}
-
-bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const
-{
-    auto view = *this;
-    valid_code_units = 0;
-
-    while (!view.is_empty()) {
-        auto result = simdutf::validate_utf16_with_errors(view.char_data(), view.length_in_code_units());
-        valid_code_units += result.count;
-
-        if (result.error == simdutf::SUCCESS)
-            return true;
-        if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE)
-            return false;
-
-        view = view.substring_view(result.count + 1);
-        ++valid_code_units;
-    }
-
-    return true;
-}
-
 size_t Utf16View::calculate_length_in_code_points() const
 {
    // FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement
    //        for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can
    //        remove this branch.
    if (validate()) [[likely]]
-        return simdutf::count_utf16(char_data(), length_in_code_units());
+        return simdutf::count_utf16(m_string, length_in_code_units());

    size_t code_points = 0;
    for ([[maybe_unused]] auto code_point : *this)
@ -315,81 +231,4 @@ size_t Utf16View::calculate_length_in_code_points() const
    return code_points;
 }

-bool Utf16View::equals_ignoring_case(Utf16View const& other) const
-{
-    if (length_in_code_units() != other.length_in_code_units())
-        return false;
-
-    for (size_t i = 0; i < length_in_code_units(); ++i) {
-        // FIXME: Handle non-ASCII case insensitive comparisons.
-        if (to_ascii_lowercase(m_code_units[i]) != to_ascii_lowercase(other.m_code_units[i]))
-            return false;
-    }
-
-    return true;
-}
-
-Utf16CodePointIterator& Utf16CodePointIterator::operator++()
-{
-    size_t code_units = length_in_code_units();
-
-    if (code_units > m_remaining_code_units) {
-        // If there aren't enough code units remaining, skip to the end.
-        m_ptr += m_remaining_code_units;
-        m_remaining_code_units = 0;
-    } else {
-        m_ptr += code_units;
-        m_remaining_code_units -= code_units;
-    }
-
-    return *this;
-}
-
-u32 Utf16CodePointIterator::operator*() const
-{
-    VERIFY(m_remaining_code_units > 0);
-
-    // rfc2781, 2.2 Decoding UTF-16
-    // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
-    //    of W1. Terminate.
-    // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
-    //    is in error and no valid character can be obtained using W1.
-    //    Terminate.
-    // 3) If there is no W2 (that is, the sequence ends with W1), or if W2
-    //    is not between 0xDC00 and 0xDFFF, the sequence is in error.
-    //    Terminate.
-    // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
-    //    bits of W1 as its 10 high-order bits and the 10 low-order bits of
-    //    W2 as its 10 low-order bits.
-    // 5) Add 0x10000 to U' to obtain the character value U. Terminate.
-
-    auto code_unit = *m_ptr;
-
-    if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) {
-        if (m_remaining_code_units > 1) {
-            auto next_code_unit = *(m_ptr + 1);
-
-            if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit))
-                return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
-        }
-
-        return UnicodeUtils::REPLACEMENT_CODE_POINT;
-    }
-
-    if (UnicodeUtils::is_utf16_low_surrogate(code_unit))
-        return UnicodeUtils::REPLACEMENT_CODE_POINT;
-
-    return static_cast<u32>(code_unit);
-}
-
-bool validate_utf16_le(ReadonlyBytes bytes)
-{
-    return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
-}
-
-bool validate_utf16_be(ReadonlyBytes bytes)
-{
-    return simdutf::validate_utf16be(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
-}
-
 }