AK: Use simdutf to validate and count UTF-8 in Utf8View

Utf8View::validate() moves out of line and is implemented with simdutf::validate_utf8_with_errors().
Utf8View::calculate_length() uses simdutf::count_utf8() when the view holds valid UTF-8. When surrogates
are allowed, validate() steps over each surrogate in a loop, so the stack depth does not depend on how
many surrogates the input contains.

NOTE(review): the targets of the four unchanged #include lines in the first hunk are not visible in this
copy of the patch; restore them before applying. The "index" line for AK/Utf8View.cpp still carries the
blob ids this patch was originally generated with.

diff --git a/AK/Utf8View.cpp b/AK/Utf8View.cpp
index fb47e00d368..14eec8294d7 100644
--- a/AK/Utf8View.cpp
+++ b/AK/Utf8View.cpp
@@ -5,11 +5,15 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#define AK_DONT_REPLACE_STD
+
 #include
 #include
 #include
 #include
 
+#include <simdutf.h>
+
 namespace AK {
 
 Utf8CodePointIterator Utf8View::iterator_at_byte_offset(size_t byte_offset) const
@@ -72,6 +76,12 @@ Utf8View Utf8View::unicode_substring_view(size_t code_point_offset, size_t code_
 
 size_t Utf8View::calculate_length() const
 {
+    // FIXME: simdutf's code point length method assumes valid UTF-8, whereas Utf8View uses U+FFFD as a replacement
+    //        for invalid code points. If we change Utf8View to only accept valid encodings as an invariant, we can
+    //        remove this branch.
+    if (validate()) [[likely]]
+        return simdutf::count_utf8(m_string.characters_without_null_termination(), m_string.length());
+
     size_t length = 0;
 
     for (size_t i = 0; i < m_string.length(); ++length) {
@@ -143,6 +153,35 @@ Utf8View Utf8View::trim(Utf8View const& characters, TrimMode mode) const
     return substring_view(substring_start, substring_length);
 }
 
+// Checks whether the viewed bytes are valid UTF-8. On return, `valid_bytes` holds the number of leading bytes that
+// were accepted before validation stopped. Surrogates are accepted only if `allow_surrogates` is AllowSurrogates::Yes.
+bool Utf8View::validate(size_t& valid_bytes, AllowSurrogates allow_surrogates) const
+{
+    // All surrogates have a UTF-8 byte length of 3.
+    constexpr size_t surrogate_byte_length = 3;
+
+    auto const* bytes = m_string.characters_without_null_termination();
+    auto const length = m_string.length();
+
+    valid_bytes = 0;
+
+    // simdutf reports a surrogate as an error, with `result.count` bytes accepted in front of it. If surrogates are
+    // allowed, step over the surrogate and validate the remainder. This is a loop instead of a recursive call so
+    // that the stack depth does not grow with the number of surrogates in the input.
+    while (true) {
+        auto result = simdutf::validate_utf8_with_errors(bytes + valid_bytes, length - valid_bytes);
+        valid_bytes += result.count;
+
+        if (result.error != simdutf::SURROGATE || allow_surrogates != AllowSurrogates::Yes)
+            return result.error == simdutf::SUCCESS;
+
+        valid_bytes += surrogate_byte_length;
+
+        // Never resume past the end of the view.
+        VERIFY(valid_bytes <= length);
+    }
+}
+
 Utf8CodePointIterator& Utf8CodePointIterator::operator++()
 {
     VERIFY(m_length > 0);
diff --git a/AK/Utf8View.h b/AK/Utf8View.h
index 0c25dce1707..166f94909c5 100644
--- a/AK/Utf8View.h
+++ b/AK/Utf8View.h
@@ -121,41 +121,13 @@ public:
         return m_length;
     }
 
-    constexpr bool validate(AllowSurrogates surrogates = AllowSurrogates::Yes) const
+    bool validate(AllowSurrogates allow_surrogates = AllowSurrogates::Yes) const
     {
         size_t valid_bytes = 0;
-        return validate(valid_bytes, surrogates);
+        return validate(valid_bytes, allow_surrogates);
     }
 
-    constexpr bool validate(size_t& valid_bytes, AllowSurrogates surrogates = AllowSurrogates::Yes) const
-    {
-        valid_bytes = 0;
-
-        for (auto it = m_string.begin(); it != m_string.end(); ++it) {
-            auto [byte_length, code_point, is_valid] = decode_leading_byte(static_cast<u8>(*it));
-            if (!is_valid)
-                return false;
-
-            for (size_t i = 1; i < byte_length; ++i) {
-                if (++it == m_string.end())
-                    return false;
-
-                auto [code_point_bits, is_valid] = decode_continuation_byte(static_cast<u8>(*it));
-                if (!is_valid)
-                    return false;
-
-                code_point <<= 6;
-                code_point |= code_point_bits;
-            }
-
-            if (!is_valid_code_point(code_point, byte_length, surrogates))
-                return false;
-
-            valid_bytes += byte_length;
-        }
-
-        return true;
-    }
+    bool validate(size_t& valid_bytes, AllowSurrogates allow_surrogates = AllowSurrogates::Yes) const;
 
 private:
     friend class Utf8CodePointIterator;
@@ -198,36 +170,6 @@ private:
         return { .is_valid = false };
     }
 
-    struct ContinuationByte {
-        u32 code_point_bits { 0 };
-        bool is_valid { false };
-    };
-
-    static constexpr ContinuationByte decode_continuation_byte(u8 byte)
-    {
-        constexpr u8 continuation_byte_encoding_bits = 0b1000'0000;
-        constexpr u8 continuation_byte_encoding_mask = 0b1100'0000;
-
-        if ((byte & continuation_byte_encoding_mask) == continuation_byte_encoding_bits) {
-            byte &= ~continuation_byte_encoding_mask;
-            return { byte, true };
-        }
-
-        return { .is_valid = false };
-    }
-
-    static constexpr bool is_valid_code_point(u32 code_point, size_t byte_length, AllowSurrogates surrogates = AllowSurrogates::Yes)
-    {
-        if (surrogates == AllowSurrogates::No && byte_length == 3 && code_point >= 0xD800 && code_point <= 0xDFFF)
-            return false;
-        for (auto const& data : utf8_encoded_byte_data) {
-            if (code_point >= data.first_code_point && code_point <= data.last_code_point)
-                return byte_length == data.byte_length;
-        }
-
-        return false;
-    }
-
     StringView m_string;
     mutable size_t m_length { 0 };
     mutable bool m_have_length { false };