From 2dc0a3b3ceacae4f18d6af5bd96f9f0e3f4b7a6a Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Tue, 5 Aug 2025 07:08:05 -0400 Subject: [PATCH] AK: Add trim methods to Utf16String that skip allocation when not needed If the string does not begin with any of the provided code units, we do not need to create a new string. --- AK/Utf16String.h | 23 +++++++++ AK/Utf16View.h | 3 +- Libraries/LibWeb/HTML/HTMLInputElement.cpp | 3 +- Libraries/LibWeb/Infra/Strings.cpp | 2 +- Tests/AK/TestUtf16String.cpp | 58 ++++++++++++++++++++++ 5 files changed, 84 insertions(+), 5 deletions(-) diff --git a/AK/Utf16String.h b/AK/Utf16String.h index 089567d1908..45c8a5de5c3 100644 --- a/AK/Utf16String.h +++ b/AK/Utf16String.h @@ -199,6 +199,29 @@ public: return view.replace(needle, replacement, replace_mode); } + ALWAYS_INLINE Utf16String trim(Utf16View const& code_units, TrimMode mode = TrimMode::Both) const + { + if (is_empty()) + return {}; + + bool needs_trimming = false; + + if (mode == TrimMode::Left || mode == TrimMode::Both) + needs_trimming |= code_units.contains(code_unit_at(0)); + if (mode == TrimMode::Right || mode == TrimMode::Both) + needs_trimming |= code_units.contains(code_unit_at(length_in_code_units() - 1)); + + if (!needs_trimming) + return *this; + + return Utf16String::from_utf16_without_validation(utf16_view().trim(code_units, mode)); + } + + ALWAYS_INLINE Utf16String trim_ascii_whitespace(TrimMode mode = TrimMode::Both) const + { + return trim(" \n\t\v\f\r"sv, mode); + } + ALWAYS_INLINE Utf16String escape_html_entities() const { return utf16_view().escape_html_entities(); } private: diff --git a/AK/Utf16View.h b/AK/Utf16View.h index c59cc536cf3..c459f9358b9 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -477,8 +477,7 @@ public: [[nodiscard]] constexpr Utf16View trim_ascii_whitespace(TrimMode mode = TrimMode::Both) const { - static constexpr Utf16View white_space { u" \n\t\v\f\r", 6uz }; - return trim(white_space, mode); + return trim(" \n\t\v\f\r"sv, mode); } constexpr Optional find_code_unit_offset(char16_t needle, size_t start_offset = 0) const diff --git a/Libraries/LibWeb/HTML/HTMLInputElement.cpp b/Libraries/LibWeb/HTML/HTMLInputElement.cpp index 44c752b4a81..ba7534bf644 100644 --- a/Libraries/LibWeb/HTML/HTMLInputElement.cpp +++ b/Libraries/LibWeb/HTML/HTMLInputElement.cpp @@ -1644,8 +1644,7 @@ Utf16String HTMLInputElement::value_sanitization_algorithm(Utf16String const& va }; auto strip_newlines_and_trim = [&]() { - auto value_without_newlines = strip_newlines(); - return Utf16String::from_utf16_without_validation(value_without_newlines.utf16_view().trim(Infra::ASCII_WHITESPACE)); + return strip_newlines().trim(Infra::ASCII_WHITESPACE); }; // https://html.spec.whatwg.org/multipage/input.html#text-(type=text)-state-and-search-state-(type=search):value-sanitization-algorithm diff --git a/Libraries/LibWeb/Infra/Strings.cpp b/Libraries/LibWeb/Infra/Strings.cpp index bd601359573..9ff26f1d47c 100644 --- a/Libraries/LibWeb/Infra/Strings.cpp +++ b/Libraries/LibWeb/Infra/Strings.cpp @@ -105,7 +105,7 @@ Utf16String strip_and_collapse_whitespace(Utf16String const& string) } // ...and then remove any leading and trailing ASCII whitespace from that string. - return Utf16String::from_utf16(builder.utf16_string_view().trim(Infra::ASCII_WHITESPACE)); + return builder.to_utf16_string().trim(Infra::ASCII_WHITESPACE); } // https://infra.spec.whatwg.org/#code-unit-prefix diff --git a/Tests/AK/TestUtf16String.cpp b/Tests/AK/TestUtf16String.cpp index 1fee98ae332..c7a0762ffc4 100644 --- a/Tests/AK/TestUtf16String.cpp +++ b/Tests/AK/TestUtf16String.cpp @@ -913,6 +913,64 @@ TEST_CASE(to_casefold) EXPECT_EQ(result, u"\u03B1\u0342\u03B9"sv); } +TEST_CASE(trim) +{ + auto expect_same_string = [](Utf16String const& string, Utf16String const& result) { + EXPECT_EQ(string, result); + + VERIFY(string.has_ascii_storage() == result.has_ascii_storage()); + auto string_view = string.utf16_view(); + auto result_view = result.utf16_view(); + + if (string.has_ascii_storage()) + EXPECT_EQ(string_view.ascii_span().data(), result_view.ascii_span().data()); + else + EXPECT_EQ(string_view.utf16_span().data(), result_view.utf16_span().data()); + }; + + Utf16View whitespace { u" "sv }; + { + auto string = u"looooong word"_utf16; + expect_same_string(string, string.trim(whitespace, TrimMode::Both)); + expect_same_string(string, string.trim(whitespace, TrimMode::Left)); + expect_same_string(string, string.trim(whitespace, TrimMode::Right)); + } + { + auto string = u" looooong word"_utf16; + EXPECT_EQ(string.trim(whitespace, TrimMode::Both), u"looooong word"sv); + EXPECT_EQ(string.trim(whitespace, TrimMode::Left), u"looooong word"sv); + expect_same_string(string, string.trim(whitespace, TrimMode::Right)); + } + { + auto string = u"looooong word "_utf16; + EXPECT_EQ(string.trim(whitespace, TrimMode::Both), u"looooong word"sv); + expect_same_string(string, string.trim(whitespace, TrimMode::Left)); + EXPECT_EQ(string.trim(whitespace, TrimMode::Right), u"looooong word"sv); + } + { + auto string = u" looooong word "_utf16; + EXPECT_EQ(string.trim(whitespace, TrimMode::Both), u"looooong word"sv); + EXPECT_EQ(string.trim(whitespace, TrimMode::Left), u"looooong word "sv); + EXPECT_EQ(string.trim(whitespace, TrimMode::Right), u" looooong word"sv); + } + { + auto string = u" \u180E "_utf16; + EXPECT_EQ(string.trim(whitespace, TrimMode::Both), u"\u180E"sv); + EXPECT_EQ(string.trim(whitespace, TrimMode::Left), u"\u180E "sv); + EXPECT_EQ(string.trim(whitespace, TrimMode::Right), u" \u180E"sv); + } + { + auto string = u"😀wfh😀"_utf16; + EXPECT_EQ(string.trim(u"😀"sv, TrimMode::Both), u"wfh"sv); + EXPECT_EQ(string.trim(u"😀"sv, TrimMode::Left), u"wfh😀"sv); + EXPECT_EQ(string.trim(u"😀"sv, TrimMode::Right), u"😀wfh"sv); + + expect_same_string(string, string.trim(whitespace, TrimMode::Both)); + expect_same_string(string, string.trim(whitespace, TrimMode::Left)); + expect_same_string(string, string.trim(whitespace, TrimMode::Right)); + } +} + TEST_CASE(copy_operations) { auto test = [](Utf16String const& string1) {