From 32ffe9bbfca5e5409eb67c63f8e4107b0f238c60 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 16 Jul 2024 13:02:51 -0400 Subject: [PATCH] AK: Replace UTF-16 validation and length computation with simdutf --- AK/Utf16View.cpp | 34 +++++++++++++++++++--------------- AK/Utf16View.h | 8 ++------ 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp index 4150f57ed7e..2b3a534e24d 100644 --- a/AK/Utf16View.cpp +++ b/AK/Utf16View.cpp @@ -1,9 +1,11 @@ /* - * Copyright (c) 2021-2023, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ +#define AK_DONT_REPLACE_STD + #include #include #include @@ -12,6 +14,8 @@ #include #include +#include <simdutf.h> + namespace AK { static constexpr u16 high_surrogate_min = 0xd800; @@ -233,27 +237,27 @@ bool Utf16View::starts_with(Utf16View const& needle) const return true; } +bool Utf16View::validate() const +{ + return simdutf::validate_utf16(reinterpret_cast<char16_t const*>(m_code_units.data()), m_code_units.size()); +} + bool Utf16View::validate(size_t& valid_code_units) const { - valid_code_units = 0; + auto result = simdutf::validate_utf16_with_errors(reinterpret_cast<char16_t const*>(m_code_units.data()), m_code_units.size()); + valid_code_units = result.count; - for (auto const* ptr = begin_ptr(); ptr < end_ptr(); ++ptr) { - if (is_high_surrogate(*ptr)) { - if ((++ptr >= end_ptr()) || !is_low_surrogate(*ptr)) - return false; - ++valid_code_units; - } else if (is_low_surrogate(*ptr)) { - return false; - } - - ++valid_code_units; - } - - return true; + return result.error == simdutf::SUCCESS; } size_t Utf16View::calculate_length_in_code_points() const { + // FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement + // for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can + // remove this branch. 
+ if (validate()) [[likely]] + return simdutf::count_utf16(reinterpret_cast<char16_t const*>(m_code_units.data()), m_code_units.size()); + size_t code_points = 0; for ([[maybe_unused]] auto code_point : *this) ++code_points; diff --git a/AK/Utf16View.h b/AK/Utf16View.h index 678d55e5d9f..8ed9b825824 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -113,12 +113,8 @@ public: bool starts_with(Utf16View const&) const; + bool validate() const; bool validate(size_t& valid_code_units) const; - bool validate() const - { - size_t valid_code_units; - return validate(valid_code_units); - } bool equals_ignoring_case(Utf16View const&) const;