From 7a17c654d293c4afaf3086dc94e8cd4bceac48b1 Mon Sep 17 00:00:00 2001
From: Timothy Flynn
Date: Tue, 30 Jul 2024 06:19:56 -0400
Subject: [PATCH] AK: Add a method to compute UTF-16 length from a UTF-8 string

---
 AK/Utf16View.cpp       | 9 +++++++++
 AK/Utf16View.h         | 2 ++
 Tests/AK/TestUtf16.cpp | 8 ++++++++
 3 files changed, 19 insertions(+)

diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp
index f09e85cab11..1276a37363c 100644
--- a/AK/Utf16View.cpp
+++ b/AK/Utf16View.cpp
@@ -129,6 +129,15 @@ ErrorOr<void> code_point_to_utf16(Utf16Data& string, u32 code_point, Endianness
     return {};
 }
 
+size_t utf16_code_unit_length_from_utf8(StringView string)
+{
+    // FIXME: The CPU-specific implementations behave differently on null inputs. We treat null views as an empty string.
+    if (string.is_empty())
+        return 0;
+
+    return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length());
+}
+
 bool Utf16View::is_high_surrogate(u16 code_unit)
 {
     return (code_unit >= high_surrogate_min) && (code_unit <= high_surrogate_max);
diff --git a/AK/Utf16View.h b/AK/Utf16View.h
index e0bd8d79aaf..f3a4fa3e209 100644
--- a/AK/Utf16View.h
+++ b/AK/Utf16View.h
@@ -26,6 +26,8 @@ ErrorOr<Utf16Data> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host)
 ErrorOr<Utf16Data> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
 ErrorOr<void> code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host);
 
+size_t utf16_code_unit_length_from_utf8(StringView);
+
 class Utf16View;
 
 class Utf16CodePointIterator {
diff --git a/Tests/AK/TestUtf16.cpp b/Tests/AK/TestUtf16.cpp
index 052ced40c57..01f7fa973e5 100644
--- a/Tests/AK/TestUtf16.cpp
+++ b/Tests/AK/TestUtf16.cpp
@@ -89,6 +89,14 @@ TEST_CASE(decode_utf16)
     EXPECT_EQ(i, expected.size());
 }
 
+TEST_CASE(utf16_code_unit_length_from_utf8)
+{
+    EXPECT_EQ(AK::utf16_code_unit_length_from_utf8(""sv), 0uz);
+    EXPECT_EQ(AK::utf16_code_unit_length_from_utf8("abc"sv), 3uz);
+    EXPECT_EQ(AK::utf16_code_unit_length_from_utf8("😀"sv), 2uz);
+    EXPECT_EQ(AK::utf16_code_unit_length_from_utf8("Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv), 39uz);
+}
+
 TEST_CASE(null_view)
 {
     Utf16View view;