AK: Define some UTF-16 helper methods

* contains
* escape_html_entities
* replace
* to_ascii_lowercase
* to_ascii_uppercase
* to_ascii_titlecase
* trim
* trim_whitespace
This commit is contained in:
Timothy Flynn 2025-06-27 12:30:25 -04:00 committed by Tim Flynn
commit 6e0290ecaa
Notes: github-actions[bot] 2025-07-18 16:47:12 +00:00
6 changed files with 345 additions and 1 deletions

View file

@ -43,6 +43,41 @@ public:
return Utf16String { move(copy) }; return Utf16String { move(copy) };
} }
// Returns this string with ASCII uppercase letters converted to lowercase. When no code unit is an
// ASCII uppercase letter, a copy of this string is returned without running the conversion.
ALWAYS_INLINE Utf16FlyString to_ascii_lowercase() const
{
    auto const source = m_data.utf16_view();

    // Inspect whichever storage representation the view currently holds.
    auto const has_uppercase = source.has_ascii_storage()
        ? any_of(source.ascii_span(), is_ascii_upper_alpha)
        : any_of(source.utf16_span(), is_ascii_upper_alpha);

    if (has_uppercase)
        return source.to_ascii_lowercase();
    return *this;
}
// Returns this string with ASCII lowercase letters converted to uppercase. When no code unit is an
// ASCII lowercase letter, a copy of this string is returned without running the conversion.
ALWAYS_INLINE Utf16FlyString to_ascii_uppercase() const
{
    auto const source = m_data.utf16_view();

    // Inspect whichever storage representation the view currently holds.
    auto const has_lowercase = source.has_ascii_storage()
        ? any_of(source.ascii_span(), is_ascii_lower_alpha)
        : any_of(source.utf16_span(), is_ascii_lower_alpha);

    if (has_lowercase)
        return source.to_ascii_uppercase();
    return *this;
}
// Returns the ASCII title-cased form of this string, as produced by the view's to_ascii_titlecase().
ALWAYS_INLINE Utf16FlyString to_ascii_titlecase() const
{
    auto const source = view();
    return source.to_ascii_titlecase();
}
ALWAYS_INLINE Utf16FlyString& operator=(Utf16String const& string) ALWAYS_INLINE Utf16FlyString& operator=(Utf16String const& string)
{ {
*this = Utf16FlyString { string }; *this = Utf16FlyString { string };

View file

@ -118,6 +118,52 @@ public:
return from_string_builder_without_validation(builder); return from_string_builder_without_validation(builder);
} }
// Returns this string with ASCII uppercase letters converted to lowercase. When no code unit is an
// ASCII uppercase letter, a copy of this string is returned without running the conversion.
ALWAYS_INLINE Utf16String to_ascii_lowercase() const
{
    auto const source = utf16_view();

    // Inspect whichever storage representation the view currently holds.
    bool has_uppercase = false;
    if (source.has_ascii_storage())
        has_uppercase = any_of(source.ascii_span(), is_ascii_upper_alpha);
    else
        has_uppercase = any_of(source.utf16_span(), is_ascii_upper_alpha);

    if (!has_uppercase)
        return *this;
    return source.to_ascii_lowercase();
}
// Returns this string with ASCII lowercase letters converted to uppercase. When no code unit is an
// ASCII lowercase letter, a copy of this string is returned without running the conversion.
ALWAYS_INLINE Utf16String to_ascii_uppercase() const
{
    auto const source = utf16_view();

    // Inspect whichever storage representation the view currently holds.
    bool has_lowercase = false;
    if (source.has_ascii_storage())
        has_lowercase = any_of(source.ascii_span(), is_ascii_lower_alpha);
    else
        has_lowercase = any_of(source.utf16_span(), is_ascii_lower_alpha);

    if (!has_lowercase)
        return *this;
    return source.to_ascii_uppercase();
}
// Returns the ASCII title-cased form of this string, as produced by the view's to_ascii_titlecase().
ALWAYS_INLINE Utf16String to_ascii_titlecase() const
{
    auto const source = utf16_view();
    return source.to_ascii_titlecase();
}
// Returns this string with `needle` replaced by `replacement` according to `replace_mode`.
// If this string is empty or does not contain `needle`, a copy of this string is returned and the
// view's replace() is not invoked.
ALWAYS_INLINE Utf16String replace(Utf16View const& needle, Utf16View const& replacement, ReplaceMode replace_mode) const
{
    auto const haystack = utf16_view();

    // The emptiness check comes first so that contains() is only consulted for non-empty strings.
    auto const has_match = !haystack.is_empty() && haystack.contains(needle);
    if (has_match)
        return haystack.replace(needle, replacement, replace_mode);
    return *this;
}
// Returns a copy of this string with HTML-special characters escaped; delegates to the view's implementation.
ALWAYS_INLINE Utf16String escape_html_entities() const
{
    auto const source = utf16_view();
    return source.escape_html_entities();
}
private: private:
ALWAYS_INLINE explicit Utf16String(NonnullRefPtr<Detail::Utf16StringData const> value) ALWAYS_INLINE explicit Utf16String(NonnullRefPtr<Detail::Utf16StringData const> value)
: Utf16StringBase(move(value)) : Utf16StringBase(move(value))

View file

@ -210,6 +210,9 @@ public:
return utf16_view().find_code_unit_offset_ignoring_case(needle, start_offset); return utf16_view().find_code_unit_offset_ignoring_case(needle, start_offset);
} }
// Returns true if the code unit `needle` occurs anywhere in this string.
[[nodiscard]] ALWAYS_INLINE bool contains(char16_t needle) const
{
    auto const offset = find_code_unit_offset(needle);
    return offset.has_value();
}
// Returns true if the code unit sequence `needle` occurs anywhere in this string.
[[nodiscard]] ALWAYS_INLINE bool contains(Utf16View const& needle) const
{
    auto const offset = find_code_unit_offset(needle);
    return offset.has_value();
}
[[nodiscard]] ALWAYS_INLINE bool starts_with(Utf16View const& needle) const { return utf16_view().starts_with(needle); } [[nodiscard]] ALWAYS_INLINE bool starts_with(Utf16View const& needle) const { return utf16_view().starts_with(needle); }
// This is primarily interesting to unit tests. // This is primarily interesting to unit tests.

View file

@ -8,6 +8,7 @@
#include <AK/Concepts.h> #include <AK/Concepts.h>
#include <AK/StringBuilder.h> #include <AK/StringBuilder.h>
#include <AK/StringView.h> #include <AK/StringView.h>
#include <AK/Utf16String.h>
#include <AK/Utf16View.h> #include <AK/Utf16View.h>
#include <AK/Utf32View.h> #include <AK/Utf32View.h>
#include <AK/Utf8View.h> #include <AK/Utf8View.h>
@ -129,6 +130,89 @@ ErrorOr<ByteString> Utf16View::to_byte_string(AllowLonelySurrogates allow_lonely
return TRY(to_utf8(allow_lonely_surrogates)).to_byte_string(); return TRY(to_utf8(allow_lonely_surrogates)).to_byte_string();
} }
// Builds a new UTF-16 string by passing every code unit of this view through AK::to_ascii_lowercase().
Utf16String Utf16View::to_ascii_lowercase() const
{
    auto const code_unit_count = length_in_code_units();
    StringBuilder lowercased(StringBuilder::Mode::UTF16, code_unit_count);

    for (size_t index = 0; index < code_unit_count; ++index) {
        auto const code_unit = code_unit_at(index);
        lowercased.append_code_unit(AK::to_ascii_lowercase(code_unit));
    }

    return lowercased.to_utf16_string();
}
// Builds a new UTF-16 string by passing every code unit of this view through AK::to_ascii_uppercase().
Utf16String Utf16View::to_ascii_uppercase() const
{
    auto const code_unit_count = length_in_code_units();
    StringBuilder uppercased(StringBuilder::Mode::UTF16, code_unit_count);

    for (size_t index = 0; index < code_unit_count; ++index) {
        auto const code_unit = code_unit_at(index);
        uppercased.append_code_unit(AK::to_ascii_uppercase(code_unit));
    }

    return uppercased.to_utf16_string();
}
// Builds a new UTF-16 string in which the first code unit, and every code unit that directly follows
// a space (U+0020), is ASCII-uppercased; every other code unit is ASCII-lowercased.
Utf16String Utf16View::to_ascii_titlecase() const
{
    auto const code_unit_count = length_in_code_units();
    StringBuilder titlecased(StringBuilder::Mode::UTF16, code_unit_count);

    // The very first code unit starts a word.
    bool starts_word = true;

    for (size_t index = 0; index < code_unit_count; ++index) {
        auto const code_unit = code_unit_at(index);
        auto const converted = starts_word
            ? AK::to_ascii_uppercase(code_unit)
            : AK::to_ascii_lowercase(code_unit);
        titlecased.append_code_unit(converted);

        // Only a plain space marks a word boundary here; the check uses the unconverted code unit.
        starts_word = (code_unit == u' ');
    }

    return titlecased.to_utf16_string();
}
// Returns a new string in which occurrences of `needle` in this view are replaced by `replacement`.
//
// - An empty view yields a default-constructed Utf16String.
// - ReplaceMode::All replaces every non-overlapping occurrence, scanning left to right; any other
//   mode replaces only the first occurrence.
// - If `needle` does not occur, the result holds the same code units as this view.
// - An empty `needle` is replaced at most once, regardless of `replace_mode` (see below).
Utf16String Utf16View::replace(Utf16View const& needle, Utf16View const& replacement, ReplaceMode replace_mode) const
{
    if (is_empty())
        return {};

    StringBuilder builder(StringBuilder::Mode::UTF16, length_in_code_units());
    auto remaining = *this;

    // An empty needle consumes no code units, so `remaining` would only advance by the match offset.
    // If the search keeps reporting a match at offset 0 (presumably it does for an empty needle),
    // the loop below could never terminate. Never iterate more than once in that case.
    auto const replace_all = replace_mode == ReplaceMode::All && !needle.is_empty();

    do {
        auto index = remaining.find_code_unit_offset(needle);
        if (!index.has_value())
            break;

        // Copy everything before the match, then the replacement, then skip past the match.
        builder.append(remaining.substring_view(0, *index));
        builder.append(replacement);
        remaining = remaining.substring_view(*index + needle.length_in_code_units());
    } while (replace_all && !remaining.is_empty());

    // Whatever is left contains no (further) replaced occurrence; copy it verbatim.
    builder.append(remaining);
    return builder.to_utf16_string();
}
// Builds a new string from this view's code points, substituting an entity reference for each of
// '<', '>', '&' and '"'. All other code points are appended unchanged.
Utf16String Utf16View::escape_html_entities() const
{
    StringBuilder escaped(StringBuilder::Mode::UTF16, length_in_code_units());

    for (auto code_point : *this) {
        switch (code_point) {
        case '<':
            escaped.append(u"&lt;"sv);
            break;
        case '>':
            escaped.append(u"&gt;"sv);
            break;
        case '&':
            escaped.append(u"&amp;"sv);
            break;
        case '"':
            escaped.append(u"&quot;"sv);
            break;
        default:
            escaped.append_code_point(code_point);
            break;
        }
    }

    return escaped.to_utf16_string();
}
bool Utf16View::is_ascii() const bool Utf16View::is_ascii() const
{ {
if (has_ascii_storage()) if (has_ascii_storage())

View file

@ -186,6 +186,10 @@ public:
return MUST(to_utf8(allow_lonely_surrogates)); return MUST(to_utf8(allow_lonely_surrogates));
} }
// Returns a new string built by applying AK::to_ascii_lowercase() to every code unit of this view.
Utf16String to_ascii_lowercase() const;
// Returns a new string built by applying AK::to_ascii_uppercase() to every code unit of this view.
Utf16String to_ascii_uppercase() const;
// Returns a new string in which the first code unit and each code unit directly following a space (U+0020)
// is ASCII-uppercased, and every other code unit is ASCII-lowercased.
Utf16String to_ascii_titlecase() const;
[[nodiscard]] ALWAYS_INLINE bool has_ascii_storage() const { return m_length_in_code_units >> Detail::UTF16_FLAG == 0; } [[nodiscard]] ALWAYS_INLINE bool has_ascii_storage() const { return m_length_in_code_units >> Detail::UTF16_FLAG == 0; }
[[nodiscard]] constexpr ReadonlyBytes bytes() const [[nodiscard]] constexpr ReadonlyBytes bytes() const
@ -243,7 +247,7 @@ public:
return false; return false;
for (size_t i = 0; i < length_in_code_units(); ++i) { for (size_t i = 0; i < length_in_code_units(); ++i) {
if (to_ascii_lowercase(code_unit_at(i)) != to_ascii_lowercase(other.code_unit_at(i))) if (AK::to_ascii_lowercase(code_unit_at(i)) != AK::to_ascii_lowercase(other.code_unit_at(i)))
return false; return false;
} }
@ -356,6 +360,9 @@ public:
return { m_string.utf16 + length_in_code_units(), 0 }; return { m_string.utf16 + length_in_code_units(), 0 };
} }
// Returns a new string with `needle` replaced by `replacement`. With ReplaceMode::All every occurrence
// found scanning left to right is replaced; otherwise only the first one. An empty view yields an empty string.
Utf16String replace(Utf16View const& needle, Utf16View const& replacement, ReplaceMode) const;
// Returns a new string with '<', '>', '&' and '"' replaced by "&lt;", "&gt;", "&amp;" and "&quot;" respectively.
Utf16String escape_html_entities() const;
[[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const [[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const
{ {
VERIFY(code_unit_offset + code_unit_length <= length_in_code_units()); VERIFY(code_unit_offset + code_unit_length <= length_in_code_units());
@ -370,6 +377,43 @@ public:
[[nodiscard]] Utf16View unicode_substring_view(size_t code_point_offset, size_t code_point_length) const; [[nodiscard]] Utf16View unicode_substring_view(size_t code_point_offset, size_t code_point_length) const;
[[nodiscard]] Utf16View unicode_substring_view(size_t code_point_offset) const { return unicode_substring_view(code_point_offset, length_in_code_points() - code_point_offset); } [[nodiscard]] Utf16View unicode_substring_view(size_t code_point_offset) const { return unicode_substring_view(code_point_offset, length_in_code_points() - code_point_offset); }
// Returns the sub-view left after stripping leading and/or trailing code units that appear in `code_units`.
// `code_units` is treated as a set of individual UTF-16 code units, not as a sequence to match.
// `mode` selects which end(s) to strip: TrimMode::Left, TrimMode::Right or TrimMode::Both.
[[nodiscard]] constexpr Utf16View trim(Utf16View const& code_units, TrimMode mode = TrimMode::Both) const
{
    auto const trims_left = mode == TrimMode::Left || mode == TrimMode::Both;
    auto const trims_right = mode == TrimMode::Right || mode == TrimMode::Both;

    size_t start = 0;
    size_t remaining_length = length_in_code_units();

    if (trims_left) {
        // `start` doubles as the scan position: it advances once per stripped code unit.
        while (start < length_in_code_units()) {
            if (remaining_length == 0)
                return {};
            if (!code_units.contains(code_unit_at(start)))
                break;

            ++start;
            --remaining_length;
        }
    }

    if (trims_right) {
        // Walk backwards from one-past-the-end; `end - 1` is the code unit under inspection.
        auto end = length_in_code_units();
        while (end > 0) {
            // Everything was already stripped (e.g. by the left pass): nothing remains.
            if (remaining_length == 0)
                return {};
            if (!code_units.contains(code_unit_at(end - 1)))
                break;

            --remaining_length;
            --end;
        }
    }

    return substring_view(start, remaining_length);
}
// Strips space, line feed, tab, vertical tab, form feed and carriage return from the end(s) of this
// view selected by `mode`.
[[nodiscard]] constexpr Utf16View trim_whitespace(TrimMode mode = TrimMode::Both) const
{
    // The six code units that trim() should treat as whitespace.
    static constexpr Utf16View ascii_whitespace { u" \n\t\v\f\r", 6uz };

    return trim(ascii_whitespace, mode);
}
constexpr Optional<size_t> find_code_unit_offset(char16_t needle, size_t start_offset = 0) const constexpr Optional<size_t> find_code_unit_offset(char16_t needle, size_t start_offset = 0) const
{ {
if (start_offset >= length_in_code_units()) if (start_offset >= length_in_code_units())
@ -434,6 +478,9 @@ public:
return {}; return {};
} }
// Returns true if the code unit `needle` occurs anywhere in this view.
[[nodiscard]] constexpr bool contains(char16_t needle) const
{
    auto const offset = find_code_unit_offset(needle);
    return offset.has_value();
}
// Returns true if the code unit sequence `needle` occurs anywhere in this view.
[[nodiscard]] constexpr bool contains(Utf16View const& needle) const
{
    auto const offset = find_code_unit_offset(needle);
    return offset.has_value();
}
[[nodiscard]] constexpr bool starts_with(Utf16View const& needle) const [[nodiscard]] constexpr bool starts_with(Utf16View const& needle) const
{ {
if (needle.is_empty()) if (needle.is_empty())

View file

@ -10,6 +10,7 @@
#include <AK/String.h> #include <AK/String.h>
#include <AK/StringView.h> #include <AK/StringView.h>
#include <AK/Types.h> #include <AK/Types.h>
#include <AK/Utf16String.h>
#include <AK/Utf16View.h> #include <AK/Utf16View.h>
TEST_CASE(decode_ascii) TEST_CASE(decode_ascii)
@ -340,6 +341,34 @@ TEST_CASE(is_ascii)
EXPECT(!u"The quick (“brown”) fox cant jump 32.3 feet, right?"sv.is_ascii()); EXPECT(!u"The quick (“brown”) fox cant jump 32.3 feet, right?"sv.is_ascii());
} }
// Checks Utf16View::to_ascii_lowercase() on empty, already-lowercase, mixed-case and all-uppercase
// input, and on a string containing an emoji (surrogate pair), which is expected to be left unchanged.
TEST_CASE(to_ascii_lowercase)
{
EXPECT_EQ(u""sv.to_ascii_lowercase(), u""sv);
EXPECT_EQ(u"foobar"sv.to_ascii_lowercase(), u"foobar"sv);
EXPECT_EQ(u"FooBar"sv.to_ascii_lowercase(), u"foobar"sv);
EXPECT_EQ(u"FOOBAR"sv.to_ascii_lowercase(), u"foobar"sv);
EXPECT_EQ(u"FOO 😀 BAR"sv.to_ascii_lowercase(), u"foo 😀 bar"sv);
}
// Checks Utf16View::to_ascii_uppercase() on empty, all-lowercase, mixed-case and already-uppercase
// input, and on a string containing an emoji (surrogate pair), which is expected to be left unchanged.
TEST_CASE(to_ascii_uppercase)
{
EXPECT_EQ(u""sv.to_ascii_uppercase(), u""sv);
EXPECT_EQ(u"foobar"sv.to_ascii_uppercase(), u"FOOBAR"sv);
EXPECT_EQ(u"FooBar"sv.to_ascii_uppercase(), u"FOOBAR"sv);
EXPECT_EQ(u"FOOBAR"sv.to_ascii_uppercase(), u"FOOBAR"sv);
EXPECT_EQ(u"foo 😀 bar"sv.to_ascii_uppercase(), u"FOO 😀 BAR"sv);
}
// Checks Utf16View::to_ascii_titlecase(): the first letter of the string and of each space-separated
// word is uppercased, all other ASCII letters are lowercased, and an emoji is left unchanged.
TEST_CASE(to_ascii_titlecase)
{
EXPECT_EQ(u""sv.to_ascii_titlecase(), u""sv);
EXPECT_EQ(u"foobar"sv.to_ascii_titlecase(), u"Foobar"sv);
EXPECT_EQ(u"FooBar"sv.to_ascii_titlecase(), u"Foobar"sv);
EXPECT_EQ(u"foo bar"sv.to_ascii_titlecase(), u"Foo Bar"sv);
EXPECT_EQ(u"FOO BAR"sv.to_ascii_titlecase(), u"Foo Bar"sv);
EXPECT_EQ(u"foo 😀 bar"sv.to_ascii_titlecase(), u"Foo 😀 Bar"sv);
}
TEST_CASE(equals_ignoring_case) TEST_CASE(equals_ignoring_case)
{ {
auto string1 = MUST(AK::utf8_to_utf16("foobar"sv)); auto string1 = MUST(AK::utf8_to_utf16("foobar"sv));
@ -355,6 +384,45 @@ TEST_CASE(equals_ignoring_case)
EXPECT(!Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 })); EXPECT(!Utf16View { string1 }.equals_ignoring_case(Utf16View { string2 }));
} }
// Checks Utf16View::replace() with both ReplaceMode::FirstOnly and ReplaceMode::All.
TEST_CASE(replace)
{
// An empty haystack always produces an empty result, whatever the needle is.
auto result = u""sv.replace({}, {}, ReplaceMode::FirstOnly);
EXPECT_EQ(result, u""sv);
result = u""sv.replace(u"foo"sv, u"bar"sv, ReplaceMode::FirstOnly);
EXPECT_EQ(result, u""sv);
// A needle that does not occur leaves the text unchanged.
result = u"foo"sv.replace(u"bar"sv, u"baz"sv, ReplaceMode::FirstOnly);
EXPECT_EQ(result, u"foo"sv);
// The needle spans the whole haystack.
result = u"foo"sv.replace(u"foo"sv, u"bar"sv, ReplaceMode::FirstOnly);
EXPECT_EQ(result, u"bar"sv);
// FirstOnly replaces a single occurrence; All replaces every occurrence.
result = u"foo"sv.replace(u"o"sv, u"e"sv, ReplaceMode::FirstOnly);
EXPECT_EQ(result, u"feo"sv);
result = u"foo"sv.replace(u"o"sv, u"e"sv, ReplaceMode::All);
EXPECT_EQ(result, u"fee"sv);
result = u"foo boo"sv.replace(u"o"sv, u"e"sv, ReplaceMode::FirstOnly);
EXPECT_EQ(result, u"feo boo"sv);
result = u"foo boo"sv.replace(u"o"sv, u"e"sv, ReplaceMode::All);
EXPECT_EQ(result, u"fee bee"sv);
// Surrogate pairs: untouched when not matched, and replaceable as multi-code-unit needles.
result = u"foo 😀 boo 😀"sv.replace(u"o"sv, u"e"sv, ReplaceMode::All);
EXPECT_EQ(result, u"fee 😀 bee 😀"sv);
result = u"foo 😀 boo 😀"sv.replace(u"😀"sv, u"🙃"sv, ReplaceMode::FirstOnly);
EXPECT_EQ(result, u"foo 🙃 boo 😀"sv);
result = u"foo 😀 boo 😀"sv.replace(u"😀"sv, u"🙃"sv, ReplaceMode::All);
EXPECT_EQ(result, u"foo 🙃 boo 🙃"sv);
// The needle ends in a space, so the final emoji (with nothing after it) is not a match.
result = u"foo 😀 boo 😀"sv.replace(u"😀 "sv, u"🙃 "sv, ReplaceMode::All);
EXPECT_EQ(result, u"foo 🙃 boo 😀"sv);
}
TEST_CASE(substring_view) TEST_CASE(substring_view)
{ {
auto string = MUST(AK::utf8_to_utf16("Привет 😀"sv)); auto string = MUST(AK::utf8_to_utf16("Привет 😀"sv));
@ -375,6 +443,67 @@ TEST_CASE(substring_view)
} }
} }
// Checks Utf16View::trim() for each TrimMode against inputs with the trimmed code unit at neither end,
// at the start, at the end, and at both ends.
TEST_CASE(trim)
{
// The set of code units to strip in most cases below: a single space.
Utf16View whitespace { u" "sv };
{
// Nothing to trim.
Utf16View view { u"word"sv };
EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"word"sv);
EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"word"sv);
EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u"word"sv);
}
{
// Leading whitespace only.
Utf16View view { u" word"sv };
EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"word"sv);
EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"word"sv);
EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u" word"sv);
}
{
// Trailing whitespace only.
Utf16View view { u"word "sv };
EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"word"sv);
EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"word "sv);
EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u"word"sv);
}
{
// Whitespace on both ends.
Utf16View view { u" word "sv };
EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"word"sv);
EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"word "sv);
EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u" word"sv);
}
{
// U+180E is not in the set being trimmed, so it must survive.
Utf16View view { u" \u180E "sv };
EXPECT_EQ(view.trim(whitespace, TrimMode::Both), u"\u180E"sv);
EXPECT_EQ(view.trim(whitespace, TrimMode::Left), u"\u180E "sv);
EXPECT_EQ(view.trim(whitespace, TrimMode::Right), u" \u180E"sv);
}
{
// The trim set here is an emoji, i.e. the two code units of a surrogate pair.
Utf16View view { u"😀wfh😀"sv };
EXPECT_EQ(view.trim(u"😀"sv, TrimMode::Both), u"wfh"sv);
EXPECT_EQ(view.trim(u"😀"sv, TrimMode::Left), u"wfh😀"sv);
EXPECT_EQ(view.trim(u"😀"sv, TrimMode::Right), u"😀wfh"sv);
}
}
// Checks both Utf16View::contains() overloads: single code unit and code unit sequence.
TEST_CASE(contains)
{
// Single code unit needle.
EXPECT(!u""sv.contains(u'a'));
EXPECT(u"a"sv.contains(u'a'));
EXPECT(!u"b"sv.contains(u'a'));
EXPECT(u"ab"sv.contains(u'a'));
// 0xD83D and 0xDE00 are the high and low surrogates that encode 😀; each is found individually.
EXPECT(u"😀"sv.contains(u'\xd83d'));
EXPECT(u"😀"sv.contains(u'\xde00'));
// View needle. An empty needle is expected to be found even in an empty haystack.
EXPECT(u""sv.contains(u""sv));
EXPECT(!u""sv.contains(u"a"sv));
EXPECT(u"a"sv.contains(u"a"sv));
EXPECT(!u"b"sv.contains(u"a"sv));
EXPECT(u"ab"sv.contains(u"a"sv));
// Lone surrogates as one-code-unit views, then the full surrogate pair.
EXPECT(u"😀"sv.contains(u"\xd83d"sv));
EXPECT(u"😀"sv.contains(u"\xde00"sv));
EXPECT(u"😀"sv.contains(u"😀"sv));
EXPECT(u"ab😀"sv.contains(u"😀"sv));
}
TEST_CASE(starts_with) TEST_CASE(starts_with)
{ {
EXPECT(Utf16View {}.starts_with(u""sv)); EXPECT(Utf16View {}.starts_with(u""sv));