From 6e0290ecaa5a1ccaf01534fb4f54b64247a533aa Mon Sep 17 00:00:00 2001
From: Timothy Flynn
Date: Fri, 27 Jun 2025 12:30:25 -0400
Subject: [PATCH] AK: Define some UTF-16 helper methods

* contains
* escape_html_entities
* replace
* to_ascii_lowercase
* to_ascii_uppercase
* to_ascii_titlecase
* trim
* trim_whitespace
---
 AK/Utf16FlyString.h        |  35 ++++++++++
 AK/Utf16String.h           |  46 +++++++++++++
 AK/Utf16StringBase.h       |   3 +
 AK/Utf16View.cpp           |  84 ++++++++++++++++++++++++
 AK/Utf16View.h             |  49 +++++++++++++-
 Tests/AK/TestUtf16View.cpp | 129 +++++++++++++++++++++++++++++++++++++
 6 files changed, 345 insertions(+), 1 deletion(-)

diff --git a/AK/Utf16FlyString.h b/AK/Utf16FlyString.h
index 15a1c74d63c..da3881bca4c 100644
--- a/AK/Utf16FlyString.h
+++ b/AK/Utf16FlyString.h
@@ -43,6 +43,41 @@ public:
         return Utf16String { move(copy) };
     }

+    ALWAYS_INLINE Utf16FlyString to_ascii_lowercase() const
+    {
+        auto view = m_data.utf16_view();
+
+        if (view.has_ascii_storage()) {
+            if (!any_of(view.ascii_span(), is_ascii_upper_alpha))
+                return *this;
+        } else {
+            if (!any_of(view.utf16_span(), is_ascii_upper_alpha))
+                return *this;
+        }
+
+        return view.to_ascii_lowercase();
+    }
+
+    ALWAYS_INLINE Utf16FlyString to_ascii_uppercase() const
+    {
+        auto view = m_data.utf16_view();
+
+        if (view.has_ascii_storage()) {
+            if (!any_of(view.ascii_span(), is_ascii_lower_alpha))
+                return *this;
+        } else {
+            if (!any_of(view.utf16_span(), is_ascii_lower_alpha))
+                return *this;
+        }
+
+        return view.to_ascii_uppercase();
+    }
+
+    ALWAYS_INLINE Utf16FlyString to_ascii_titlecase() const
+    {
+        return m_data.utf16_view().to_ascii_titlecase();
+    }
+
     ALWAYS_INLINE Utf16FlyString& operator=(Utf16String const& string)
     {
         *this = Utf16FlyString { string };
diff --git a/AK/Utf16String.h b/AK/Utf16String.h
index 040165d667e..43277168328 100644
--- a/AK/Utf16String.h
+++ b/AK/Utf16String.h
@@ -118,6 +118,52 @@ public:
         return from_string_builder_without_validation(builder);
     }

+    ALWAYS_INLINE Utf16String to_ascii_lowercase() const
+    {
+        auto view = utf16_view();
+
+        if (view.has_ascii_storage()) {
+            if (!any_of(view.ascii_span(), is_ascii_upper_alpha))
+                return *this;
+        } else {
+            if (!any_of(view.utf16_span(), is_ascii_upper_alpha))
+                return *this;
+        }
+
+        return view.to_ascii_lowercase();
+    }
+
+    ALWAYS_INLINE Utf16String to_ascii_uppercase() const
+    {
+        auto view = utf16_view();
+
+        if (view.has_ascii_storage()) {
+            if (!any_of(view.ascii_span(), is_ascii_lower_alpha))
+                return *this;
+        } else {
+            if (!any_of(view.utf16_span(), is_ascii_lower_alpha))
+                return *this;
+        }
+
+        return view.to_ascii_uppercase();
+    }
+
+    ALWAYS_INLINE Utf16String to_ascii_titlecase() const
+    {
+        return utf16_view().to_ascii_titlecase();
+    }
+
+    ALWAYS_INLINE Utf16String replace(Utf16View const& needle, Utf16View const& replacement, ReplaceMode replace_mode) const
+    {
+        auto view = utf16_view();
+        if (view.is_empty() || !view.contains(needle))
+            return *this;
+
+        return view.replace(needle, replacement, replace_mode);
+    }
+
+    ALWAYS_INLINE Utf16String escape_html_entities() const { return utf16_view().escape_html_entities(); }
+
 private:
     ALWAYS_INLINE explicit Utf16String(NonnullRefPtr value)
         : Utf16StringBase(move(value))
diff --git a/AK/Utf16StringBase.h b/AK/Utf16StringBase.h
index 6f33788b3e9..33d07ce7f5f 100644
--- a/AK/Utf16StringBase.h
+++ b/AK/Utf16StringBase.h
@@ -210,6 +210,9 @@ public:
         return utf16_view().find_code_unit_offset_ignoring_case(needle, start_offset);
     }

+    [[nodiscard]] ALWAYS_INLINE bool contains(char16_t needle) const { return find_code_unit_offset(needle).has_value(); }
+    [[nodiscard]] ALWAYS_INLINE bool contains(Utf16View const& needle) const { return find_code_unit_offset(needle).has_value(); }
+
     [[nodiscard]] ALWAYS_INLINE bool starts_with(Utf16View const& needle) const { return utf16_view().starts_with(needle); }

     // This is primarily interesting to unit tests.
diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp
index f797acb2bdf..ceb6e9c2cd4 100644
--- a/AK/Utf16View.cpp
+++ b/AK/Utf16View.cpp
@@ -8,6 +8,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -129,6 +130,89 @@ ErrorOr Utf16View::to_byte_string(AllowLonelySurrogates allow_lonely
     return TRY(to_utf8(allow_lonely_surrogates)).to_byte_string();
 }

+Utf16String Utf16View::to_ascii_lowercase() const
+{
+    StringBuilder builder(StringBuilder::Mode::UTF16, length_in_code_units());
+
+    for (size_t i = 0; i < length_in_code_units(); ++i)
+        builder.append_code_unit(AK::to_ascii_lowercase(code_unit_at(i)));
+
+    return builder.to_utf16_string();
+}
+
+Utf16String Utf16View::to_ascii_uppercase() const
+{
+    StringBuilder builder(StringBuilder::Mode::UTF16, length_in_code_units());
+
+    for (size_t i = 0; i < length_in_code_units(); ++i)
+        builder.append_code_unit(AK::to_ascii_uppercase(code_unit_at(i)));
+
+    return builder.to_utf16_string();
+}
+
+Utf16String Utf16View::to_ascii_titlecase() const
+{
+    StringBuilder builder(StringBuilder::Mode::UTF16, length_in_code_units());
+    bool next_is_upper = true;
+
+    for (size_t i = 0; i < length_in_code_units(); ++i) {
+        auto code_unit = code_unit_at(i);
+
+        if (next_is_upper)
+            builder.append_code_unit(AK::to_ascii_uppercase(code_unit));
+        else
+            builder.append_code_unit(AK::to_ascii_lowercase(code_unit));
+
+        next_is_upper = code_unit == u' ';
+    }
+
+    return builder.to_utf16_string();
+}
+
+Utf16String Utf16View::replace(Utf16View const& needle, Utf16View const& replacement, ReplaceMode replace_mode) const
+{
+    if (is_empty())
+        return {};
+
+    StringBuilder builder(StringBuilder::Mode::UTF16, length_in_code_units());
+    auto remaining = *this;
+
+    do {
+        auto index = remaining.find_code_unit_offset(needle);
+        if (!index.has_value())
+            break;
+
+        builder.append(remaining.substring_view(0, *index));
+        builder.append(replacement);
+
+        remaining = remaining.substring_view(*index + needle.length_in_code_units());
+        // An empty needle matches without consuming input, so only replace it once to guarantee termination.
+    } while (replace_mode == ReplaceMode::All && !needle.is_empty() && !remaining.is_empty());
+
+    builder.append(remaining);
+    return builder.to_utf16_string();
+}
+
+Utf16String Utf16View::escape_html_entities() const
+{
+    StringBuilder builder(StringBuilder::Mode::UTF16, length_in_code_units());
+
+    for (auto code_point : *this) {
+        if (code_point == '<')
+            builder.append(u"&lt;"sv);
+        else if (code_point == '>')
+            builder.append(u"&gt;"sv);
+        else if (code_point == '&')
+            builder.append(u"&amp;"sv);
+        else if (code_point == '"')
+            builder.append(u"&quot;"sv);
+        else
+            builder.append_code_point(code_point);
+    }
+
+    return builder.to_utf16_string();
+}
+
 bool Utf16View::is_ascii() const
 {
     if (has_ascii_storage())
diff --git a/AK/Utf16View.h b/AK/Utf16View.h
index 2818cb92cef..730611b1f97 100644
--- a/AK/Utf16View.h
+++ b/AK/Utf16View.h
@@ -186,6 +186,10 @@ public:
         return MUST(to_utf8(allow_lonely_surrogates));
     }

+    Utf16String to_ascii_lowercase() const;
+    Utf16String to_ascii_uppercase() const;
+    Utf16String to_ascii_titlecase() const;
+
     [[nodiscard]] ALWAYS_INLINE bool has_ascii_storage() const { return m_length_in_code_units >> Detail::UTF16_FLAG == 0; }

     [[nodiscard]] constexpr ReadonlyBytes bytes() const
@@ -243,7 +247,7 @@ public:
             return false;

         for (size_t i = 0; i < length_in_code_units(); ++i) {
-            if (to_ascii_lowercase(code_unit_at(i)) != to_ascii_lowercase(other.code_unit_at(i)))
+            if (AK::to_ascii_lowercase(code_unit_at(i)) != AK::to_ascii_lowercase(other.code_unit_at(i)))
                 return false;
         }

@@ -356,6 +360,9 @@ public:
         return { m_string.utf16 + length_in_code_units(), 0 };
     }

+    Utf16String replace(Utf16View const& needle, Utf16View const& replacement, ReplaceMode) const;
+    Utf16String escape_html_entities() const;
+
     [[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const
     {
         VERIFY(code_unit_offset + code_unit_length <= length_in_code_units());
@@ -370,6 +377,43 @@ public:
     [[nodiscard]] Utf16View unicode_substring_view(size_t code_point_offset, size_t code_point_length) const;
     [[nodiscard]] Utf16View unicode_substring_view(size_t code_point_offset) const { return unicode_substring_view(code_point_offset, length_in_code_points() - code_point_offset); }

+    [[nodiscard]] constexpr Utf16View trim(Utf16View const& code_units, TrimMode mode = TrimMode::Both) const
+    {
+        size_t substring_start = 0;
+        size_t substring_length = length_in_code_units();
+
+        if (mode == TrimMode::Left || mode == TrimMode::Both) {
+            for (size_t i = 0; i < length_in_code_units(); ++i) {
+                if (substring_length == 0)
+                    return {};
+                if (!code_units.contains(code_unit_at(i)))
+                    break;
+
+                ++substring_start;
+                --substring_length;
+            }
+        }
+
+        if (mode == TrimMode::Right || mode == TrimMode::Both) {
+            for (size_t i = length_in_code_units(); i > 0; --i) {
+                if (substring_length == 0)
+                    return {};
+                if (!code_units.contains(code_unit_at(i - 1)))
+                    break;
+
+                --substring_length;
+            }
+        }
+
+        return substring_view(substring_start, substring_length);
+    }
+
+    [[nodiscard]] constexpr Utf16View trim_whitespace(TrimMode mode = TrimMode::Both) const
+    {
+        static constexpr Utf16View white_space { u" \n\t\v\f\r", 6uz };
+        return trim(white_space, mode);
+    }
+
     constexpr Optional<size_t> find_code_unit_offset(char16_t needle, size_t start_offset = 0) const
     {
         if (start_offset >= length_in_code_units())
@@ -434,6 +478,9 @@ public:
         return {};
     }

+    [[nodiscard]] constexpr bool contains(char16_t needle) const { return find_code_unit_offset(needle).has_value(); }
+    [[nodiscard]] constexpr bool contains(Utf16View const& needle) const { return find_code_unit_offset(needle).has_value(); }
+
     [[nodiscard]] constexpr bool starts_with(Utf16View const& needle) const
     {
         if (needle.is_empty())
diff --git a/Tests/AK/TestUtf16View.cpp b/Tests/AK/TestUtf16View.cpp
index 818ef7b197e..ef7086e71c2 100644
--- a/Tests/AK/TestUtf16View.cpp
+++ b/Tests/AK/TestUtf16View.cpp
@@ -10,6 +10,7 @@
 #include
 #include
 #include
+#include
 #include

 TEST_CASE(decode_ascii)
@@ -340,6 +341,34 @@ TEST_CASE(is_ascii)
     EXPECT(!u"The quick (“brown”) fox can’t jump 32.3 feet, right?"sv.is_ascii());
 }

+TEST_CASE(to_ascii_lowercase)
+{
+    EXPECT_EQ(u""sv.to_ascii_lowercase(), u""sv);
+    EXPECT_EQ(u"foobar"sv.to_ascii_lowercase(), u"foobar"sv);
+    EXPECT_EQ(u"FooBar"sv.to_ascii_lowercase(), u"foobar"sv);
+    EXPECT_EQ(u"FOOBAR"sv.to_ascii_lowercase(), u"foobar"sv);
+    EXPECT_EQ(u"FOO 😀 BAR"sv.to_ascii_lowercase(), u"foo 😀 bar"sv);
+}
+
+TEST_CASE(to_ascii_uppercase)
+{
+    EXPECT_EQ(u""sv.to_ascii_uppercase(), u""sv);
+    EXPECT_EQ(u"foobar"sv.to_ascii_uppercase(), u"FOOBAR"sv);
+    EXPECT_EQ(u"FooBar"sv.to_ascii_uppercase(), u"FOOBAR"sv);
+    EXPECT_EQ(u"FOOBAR"sv.to_ascii_uppercase(), u"FOOBAR"sv);
+    EXPECT_EQ(u"foo 😀 bar"sv.to_ascii_uppercase(), u"FOO 😀 BAR"sv);
+}
+
+TEST_CASE(to_ascii_titlecase)
+{
+    EXPECT_EQ(u""sv.to_ascii_titlecase(), u""sv);
+    EXPECT_EQ(u"foobar"sv.to_ascii_titlecase(), u"Foobar"sv);
+    EXPECT_EQ(u"FooBar"sv.to_ascii_titlecase(), u"Foobar"sv);
+    EXPECT_EQ(u"foo bar"sv.to_ascii_titlecase(), u"Foo Bar"sv);
+    EXPECT_EQ(u"FOO BAR"sv.to_ascii_titlecase(), u"Foo Bar"sv);
+    EXPECT_EQ(u"foo 😀 bar"sv.to_ascii_titlecase(), u"Foo 😀 Bar"sv);
+}
+
 TEST_CASE(equals_ignoring_case)
 {
     auto string1 = MUST(AK::utf8_to_utf16("foobar"sv));
@@ -355,6 +384,45 @@ TEST_CASE(equals_ignoring_case)
     EXPECT(!Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 }));
 }

+TEST_CASE(replace)
+{
+    auto result = u""sv.replace({}, {}, ReplaceMode::FirstOnly);
+    EXPECT_EQ(result, u""sv);
+
+    result = u""sv.replace(u"foo"sv, u"bar"sv, ReplaceMode::FirstOnly);
+    EXPECT_EQ(result, u""sv);
+
+    result = u"foo"sv.replace(u"bar"sv, u"baz"sv, ReplaceMode::FirstOnly);
+    EXPECT_EQ(result, u"foo"sv);
+
+    result = u"foo"sv.replace(u"foo"sv, u"bar"sv, ReplaceMode::FirstOnly);
+    EXPECT_EQ(result, u"bar"sv);
+
+    result = u"foo"sv.replace(u"o"sv, u"e"sv, ReplaceMode::FirstOnly);
+    EXPECT_EQ(result, u"feo"sv);
+
+    result = u"foo"sv.replace(u"o"sv, u"e"sv, ReplaceMode::All);
+    EXPECT_EQ(result, u"fee"sv);
+
+    result = u"foo boo"sv.replace(u"o"sv, u"e"sv, ReplaceMode::FirstOnly);
+    EXPECT_EQ(result, u"feo boo"sv);
+
+    result = u"foo boo"sv.replace(u"o"sv, u"e"sv, ReplaceMode::All);
+    EXPECT_EQ(result, u"fee bee"sv);
+
+    result = u"foo 😀 boo 😀"sv.replace(u"o"sv, u"e"sv, ReplaceMode::All);
+    EXPECT_EQ(result, u"fee 😀 bee 😀"sv);
+
+    result = u"foo 😀 boo 😀"sv.replace(u"😀"sv, u"🙃"sv, ReplaceMode::FirstOnly);
+    EXPECT_EQ(result, u"foo 🙃 boo 😀"sv);
+
+    result = u"foo 😀 boo 😀"sv.replace(u"😀"sv, u"🙃"sv, ReplaceMode::All);
+    EXPECT_EQ(result, u"foo 🙃 boo 🙃"sv);
+
+    result = u"foo 😀 boo 😀"sv.replace(u"😀 "sv, u"🙃 "sv, ReplaceMode::All);
+    EXPECT_EQ(result, u"foo 🙃 boo 😀"sv);
+}
+
 TEST_CASE(substring_view)
 {
     auto string = MUST(AK::utf8_to_utf16("Привет 😀"sv));
@@ -375,6 +443,67 @@ TEST_CASE(substring_view)
     }
 }

+TEST_CASE(trim)
+{
+    Utf16View whitespace { u" "sv };
+    {
+        Utf16View view { u"word"sv };
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"word"sv);
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"word"sv);
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u"word"sv);
+    }
+    {
+        Utf16View view { u" word"sv };
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"word"sv);
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"word"sv);
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u" word"sv);
+    }
+    {
+        Utf16View view { u"word "sv };
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"word"sv);
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"word "sv);
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u"word"sv);
+    }
+    {
+        Utf16View view { u" word "sv };
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"word"sv);
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"word "sv);
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u" word"sv);
+    }
+    {
+        Utf16View view { u" \u180E "sv };
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"\u180E"sv);
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"\u180E "sv);
+        EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u" \u180E"sv);
+    }
+    {
+        Utf16View view { u"😀wfh😀"sv };
+        EXPECT_EQ(view.trim(u"😀"sv, TrimMode::Both), u"wfh"sv);
+        EXPECT_EQ(view.trim(u"😀"sv, TrimMode::Left), u"wfh😀"sv);
+        EXPECT_EQ(view.trim(u"😀"sv, TrimMode::Right), u"😀wfh"sv);
+    }
+}
+
+TEST_CASE(contains)
+{
+    EXPECT(!u""sv.contains(u'a'));
+    EXPECT(u"a"sv.contains(u'a'));
+    EXPECT(!u"b"sv.contains(u'a'));
+    EXPECT(u"ab"sv.contains(u'a'));
+    EXPECT(u"😀"sv.contains(u'\xd83d'));
+    EXPECT(u"😀"sv.contains(u'\xde00'));
+
+    EXPECT(u""sv.contains(u""sv));
+    EXPECT(!u""sv.contains(u"a"sv));
+    EXPECT(u"a"sv.contains(u"a"sv));
+    EXPECT(!u"b"sv.contains(u"a"sv));
+    EXPECT(u"ab"sv.contains(u"a"sv));
+    EXPECT(u"😀"sv.contains(u"\xd83d"sv));
+    EXPECT(u"😀"sv.contains(u"\xde00"sv));
+    EXPECT(u"😀"sv.contains(u"😀"sv));
+    EXPECT(u"ab😀"sv.contains(u"😀"sv));
+}
+
 TEST_CASE(starts_with)
 {
     EXPECT(Utf16View {}.starts_with(u""sv));