AK: Add trim methods to Utf16String that skip allocation when not needed

If the string does not begin or end (depending on the trim mode) with any of
the provided code units, there is nothing to trim, so we do not need to create
a new string.
This commit is contained in:
Timothy Flynn 2025-08-05 07:08:05 -04:00 committed by Jelle Raaijmakers
commit 2dc0a3b3ce
Notes: github-actions[bot] 2025-08-05 13:15:06 +00:00
5 changed files with 84 additions and 5 deletions

View file

@ -199,6 +199,29 @@ public:
return view.replace(needle, replacement, replace_mode);
}
// Trims the code units found in `code_units` from the end(s) of this string selected by `mode`.
//
// An empty string yields a default-constructed string. Otherwise only the first and/or last code unit (per `mode`)
// is inspected up front: if none of the inspected code units is in `code_units`, this string is returned as-is
// instead of building a new string from the trimmed view.
ALWAYS_INLINE Utf16String trim(Utf16View const& code_units, TrimMode mode = TrimMode::Both) const
{
    if (is_empty())
        return {};

    auto const inspects_front = mode == TrimMode::Left || mode == TrimMode::Both;
    auto const inspects_back = mode == TrimMode::Right || mode == TrimMode::Both;

    auto const front_is_trimmable = inspects_front && code_units.contains(code_unit_at(0));
    auto const back_is_trimmable = inspects_back && code_units.contains(code_unit_at(length_in_code_units() - 1));

    if (front_is_trimmable || back_is_trimmable)
        return Utf16String::from_utf16_without_validation(utf16_view().trim(code_units, mode));

    return *this;
}
// Trims the code units space, \n, \t, \v, \f and \r from the end(s) of this string selected by `mode`, by
// delegating to trim().
ALWAYS_INLINE Utf16String trim_ascii_whitespace(TrimMode mode = TrimMode::Both) const
{
    Utf16View const ascii_whitespace = " \n\t\v\f\r"sv;
    return trim(ascii_whitespace, mode);
}
// Returns the result of calling escape_html_entities() on this string's UTF-16 view.
ALWAYS_INLINE Utf16String escape_html_entities() const { return utf16_view().escape_html_entities(); }
private:

View file

@ -477,8 +477,7 @@ public:
// Returns this view with the code units space, \n, \t, \v, \f and \r trimmed from the end(s) selected by `mode`,
// by delegating to trim().
[[nodiscard]] constexpr Utf16View trim_ascii_whitespace(TrimMode mode = TrimMode::Both) const
{
    // A single return: the whitespace set is passed as an ASCII literal, the same form the Utf16String
    // counterpart uses, rather than through a separate UTF-16 constant.
    return trim(" \n\t\v\f\r"sv, mode);
}
constexpr Optional<size_t> find_code_unit_offset(char16_t needle, size_t start_offset = 0) const

View file

@ -1644,8 +1644,7 @@ Utf16String HTMLInputElement::value_sanitization_algorithm(Utf16String const& va
};
// Strips newlines from the value, then trims ASCII whitespace from both ends of the result.
auto strip_newlines_and_trim = [&]() {
    // Calling trim() on the stripped string directly (rather than rebuilding a string from a trimmed view)
    // lets the stripped string be returned as-is when neither end holds ASCII whitespace.
    return strip_newlines().trim(Infra::ASCII_WHITESPACE);
};
// https://html.spec.whatwg.org/multipage/input.html#text-(type=text)-state-and-search-state-(type=search):value-sanitization-algorithm

View file

@ -105,7 +105,7 @@ Utf16String strip_and_collapse_whitespace(Utf16String const& string)
}
// ...and then remove any leading and trailing ASCII whitespace from that string.
return Utf16String::from_utf16(builder.utf16_string_view().trim(Infra::ASCII_WHITESPACE));
return builder.to_utf16_string().trim(Infra::ASCII_WHITESPACE);
}
// https://infra.spec.whatwg.org/#code-unit-prefix

View file

@ -913,6 +913,64 @@ TEST_CASE(to_casefold)
EXPECT_EQ(result, u"\u03B1\u0342\u03B9"sv);
}
// Exercises Utf16String::trim in every TrimMode. Inputs that need trimming are compared against the expected
// contents; inputs that need no trimming must additionally expose the same span data pointer as the input.
TEST_CASE(trim)
{
    // Checks that `trimmed` equals `original`, has the same storage kind, and exposes the same span data pointer.
    auto expect_shared_storage = [](Utf16String const& original, Utf16String const& trimmed) {
        EXPECT_EQ(original, trimmed);

        // The span accessor used below is chosen from `original`'s storage kind alone, so both must agree on it.
        VERIFY(original.has_ascii_storage() == trimmed.has_ascii_storage());

        auto original_view = original.utf16_view();
        auto trimmed_view = trimmed.utf16_view();

        if (!original.has_ascii_storage()) {
            EXPECT_EQ(original_view.utf16_span().data(), trimmed_view.utf16_span().data());
        } else {
            EXPECT_EQ(original_view.ascii_span().data(), trimmed_view.ascii_span().data());
        }
    };

    Utf16View space { u" "sv };

    // Checks that trimming spaces leaves `subject` untouched in Both, Left and Right mode (in that order).
    TrimMode const every_mode[] = { TrimMode::Both, TrimMode::Left, TrimMode::Right };
    auto expect_untouched_in_every_mode = [&](Utf16String const& subject) {
        for (auto mode : every_mode)
            expect_shared_storage(subject, subject.trim(space, mode));
    };

    // No space at either end.
    {
        auto unpadded = u"looooong word"_utf16;
        expect_untouched_in_every_mode(unpadded);
    }

    // Space at the front only.
    {
        auto front_padded = u" looooong word"_utf16;
        EXPECT_EQ(front_padded.trim(space, TrimMode::Both), u"looooong word"sv);
        EXPECT_EQ(front_padded.trim(space, TrimMode::Left), u"looooong word"sv);
        expect_shared_storage(front_padded, front_padded.trim(space, TrimMode::Right));
    }

    // Space at the back only.
    {
        auto back_padded = u"looooong word "_utf16;
        EXPECT_EQ(back_padded.trim(space, TrimMode::Both), u"looooong word"sv);
        expect_shared_storage(back_padded, back_padded.trim(space, TrimMode::Left));
        EXPECT_EQ(back_padded.trim(space, TrimMode::Right), u"looooong word"sv);
    }

    // Space at both ends.
    {
        auto padded = u" looooong word "_utf16;
        EXPECT_EQ(padded.trim(space, TrimMode::Both), u"looooong word"sv);
        EXPECT_EQ(padded.trim(space, TrimMode::Left), u"looooong word "sv);
        EXPECT_EQ(padded.trim(space, TrimMode::Right), u" looooong word"sv);
    }

    // Non-ASCII content surrounded by spaces.
    {
        auto padded_non_ascii = u" \u180E "_utf16;
        EXPECT_EQ(padded_non_ascii.trim(space, TrimMode::Both), u"\u180E"sv);
        EXPECT_EQ(padded_non_ascii.trim(space, TrimMode::Left), u"\u180E "sv);
        EXPECT_EQ(padded_non_ascii.trim(space, TrimMode::Right), u" \u180E"sv);
    }

    // Emoji at both ends: trimmed when the emoji itself is the trim set, untouched when trimming spaces.
    {
        auto emoji_wrapped = u"😀wfh😀"_utf16;
        EXPECT_EQ(emoji_wrapped.trim(u"😀"sv, TrimMode::Both), u"wfh"sv);
        EXPECT_EQ(emoji_wrapped.trim(u"😀"sv, TrimMode::Left), u"wfh😀"sv);
        EXPECT_EQ(emoji_wrapped.trim(u"😀"sv, TrimMode::Right), u"😀wfh"sv);

        expect_untouched_in_every_mode(emoji_wrapped);
    }
}
TEST_CASE(copy_operations)
{
auto test = [](Utf16String const& string1) {