AK: Implement a method to split a UTF-16 string

This commit is contained in:
Timothy Flynn 2025-07-26 10:09:06 -04:00 committed by Andreas Kling
commit baddac5155
Notes: github-actions[bot] 2025-07-28 10:27:13 +00:00
4 changed files with 115 additions and 0 deletions

View file

@ -225,6 +225,21 @@ public:
// Forwards to starts_with() on the view returned by utf16_view().
[[nodiscard]] ALWAYS_INLINE bool starts_with(Utf16View const& needle) const
{
    return utf16_view().starts_with(needle);
}
// Forwards to ends_with() on the view returned by utf16_view().
[[nodiscard]] ALWAYS_INLINE bool ends_with(Utf16View const& needle) const
{
    return utf16_view().ends_with(needle);
}
// Splits on a single code unit by forwarding to split_view() on the view
// returned by utf16_view().
[[nodiscard]] ALWAYS_INLINE Vector<Utf16View> split_view(char16_t needle, SplitBehavior split_behavior) const
{
    return utf16_view().split_view(needle, split_behavior);
}
// Splits on a (possibly multi-code-unit) separator view by forwarding to
// split_view() on the view returned by utf16_view().
[[nodiscard]] ALWAYS_INLINE Vector<Utf16View> split_view(Utf16View const& needle, SplitBehavior split_behavior) const
{
    return utf16_view().split_view(needle, split_behavior);
}
// Invokes `callback` through for_each_split_view() on the view returned by
// utf16_view(), using a single code unit as the separator. The callback is
// perfectly forwarded.
template<typename Callback>
ALWAYS_INLINE void for_each_split_view(char16_t separator, SplitBehavior split_behavior, Callback&& callback) const
{
    auto const view = utf16_view();
    view.for_each_split_view(separator, split_behavior, forward<Callback>(callback));
}
// Invokes `callback` through for_each_split_view() on the view returned by
// utf16_view(), using a separator view. The callback is perfectly forwarded.
template<typename Callback>
ALWAYS_INLINE void for_each_split_view(Utf16View const& separator, SplitBehavior split_behavior, Callback&& callback) const
{
    auto const view = utf16_view();
    view.for_each_split_view(separator, split_behavior, forward<Callback>(callback));
}
// This is primarily interesting to unit tests.
[[nodiscard]] constexpr bool has_short_ascii_storage() const
{

View file

@ -254,6 +254,24 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
VERIFY_NOT_REACHED();
}
// Convenience overload for splitting on a single UTF-16 code unit: the code
// unit is wrapped in a one-element view and handed to the Utf16View overload.
Vector<Utf16View> Utf16View::split_view(char16_t separator, SplitBehavior split_behavior) const
{
    return split_view(Utf16View { &separator, 1 }, split_behavior);
}
// Gathers every part reported by for_each_split_view() into a Vector, in the
// order they are reported. The collector never asks the iteration to stop.
Vector<Utf16View> Utf16View::split_view(Utf16View const& separator, SplitBehavior split_behavior) const
{
    Vector<Utf16View> result;

    auto collect = [&result](Utf16View const& piece) {
        result.append(piece);
        return IterationDecision::Continue;
    };
    for_each_split_view(separator, split_behavior, collect);

    return result;
}
size_t Utf16View::calculate_length_in_code_points() const
{
ASSERT(!has_ascii_storage());

View file

@ -10,6 +10,7 @@
#include <AK/Error.h>
#include <AK/Format.h>
#include <AK/Forward.h>
#include <AK/IterationDecision.h>
#include <AK/MemMem.h>
#include <AK/Optional.h>
#include <AK/Span.h>
@ -515,6 +516,46 @@ public:
return substring_view(length_in_code_units() - needle_length, needle_length) == needle;
}
// Split this view on a separator and return the resulting parts as a Vector.
// The char16_t overload wraps the single code unit in a one-element view and
// defers to the Utf16View overload, which collects the parts produced by
// for_each_split_view().
[[nodiscard]] Vector<Utf16View> split_view(char16_t, SplitBehavior) const;
[[nodiscard]] Vector<Utf16View> split_view(Utf16View const&, SplitBehavior) const;
// Single-code-unit convenience overload: wraps `separator` in a one-element
// view and forwards everything to the Utf16View overload.
template<typename Callback>
constexpr void for_each_split_view(char16_t separator, SplitBehavior split_behavior, Callback&& callback) const
{
    for_each_split_view(Utf16View { &separator, 1 }, split_behavior, forward<Callback>(callback));
}
// Walks this view from front to back, invoking `callback` with each part that
// lies between occurrences of `separator`.
//
// - `separator` must not be empty (VERIFY-ed).
// - An empty view produces no callbacks at all.
// - Empty parts are only reported when SplitBehavior::KeepEmpty is set.
// - With SplitBehavior::KeepTrailingSeparator, each part that is followed by a
//   separator includes that separator.
// - If the callback returns IterationDecision::Break for a part found before
//   a separator, iteration stops. The return value for the final remainder is
//   not inspected, as nothing follows it.
template<typename Callback>
constexpr void for_each_split_view(Utf16View const& separator, SplitBehavior split_behavior, Callback&& callback) const
{
    VERIFY(!separator.is_empty());

    if (is_empty())
        return;

    bool const emit_empty_parts = has_flag(split_behavior, SplitBehavior::KeepEmpty);
    bool const include_separator = has_flag(split_behavior, SplitBehavior::KeepTrailingSeparator);
    auto const separator_length = separator.length_in_code_units();

    // The not-yet-split tail of this view; shrinks past each separator found.
    Utf16View remaining { *this };

    while (true) {
        auto const match = remaining.find_code_unit_offset(separator);
        if (!match.has_value())
            break;

        auto const part_length = *match;
        auto const consumed = part_length + separator_length;

        // A separator at offset 0 means the part in front of it is empty.
        if (emit_empty_parts || part_length != 0) {
            auto part = remaining.substring_view(0, include_separator ? consumed : part_length);

            if (callback(part) == IterationDecision::Break)
                return;
        }

        remaining = remaining.substring_view(consumed);
    }

    // Whatever follows the last separator is the final part.
    if (remaining.is_empty() && !emit_empty_parts)
        return;

    callback(remaining);
}
// https://infra.spec.whatwg.org/#code-unit-less-than
[[nodiscard]] constexpr bool is_code_unit_less_than(Utf16View const& other) const
{

View file

@ -634,6 +634,47 @@ TEST_CASE(ends_with)
EXPECT(!emoji.ends_with(u"😀"sv));
}
// Exercises Utf16View::split_view with both the single-code-unit and the view
// separator overloads, across the SplitBehavior flags.
TEST_CASE(split_view)
{
    // Adjacent separators in the middle: the empty part is dropped unless KeepEmpty is set.
    {
        auto test = u"axxbxcxd"sv;

        EXPECT_EQ(test.split_view('x', SplitBehavior::Nothing), Vector({ u"a"sv, u"b"sv, u"c"sv, u"d"sv }));
        EXPECT_EQ(test.split_view("x"sv, SplitBehavior::Nothing), Vector({ u"a"sv, u"b"sv, u"c"sv, u"d"sv }));

        EXPECT_EQ(test.split_view('x', SplitBehavior::KeepEmpty), Vector({ u"a"sv, u""sv, u"b"sv, u"c"sv, u"d"sv }));
        EXPECT_EQ(test.split_view("x"sv, SplitBehavior::KeepEmpty), Vector({ u"a"sv, u""sv, u"b"sv, u"c"sv, u"d"sv }));
    }
    // Trailing separator: the empty remainder is only reported with KeepEmpty.
    {
        auto test = u"axxbx"sv;

        EXPECT_EQ(test.split_view('x', SplitBehavior::Nothing), Vector({ u"a"sv, u"b"sv }));
        EXPECT_EQ(test.split_view("x"sv, SplitBehavior::Nothing), Vector({ u"a"sv, u"b"sv }));

        EXPECT_EQ(test.split_view('x', SplitBehavior::KeepEmpty), Vector({ u"a"sv, u""sv, u"b"sv, u""sv }));
        EXPECT_EQ(test.split_view("x"sv, SplitBehavior::KeepEmpty), Vector({ u"a"sv, u""sv, u"b"sv, u""sv }));
    }
    // Multi-code-unit separator.
    {
        auto test = u"axxbcxxdxx"sv;

        EXPECT_EQ(test.split_view(u"xx"sv, SplitBehavior::Nothing), Vector({ u"a"sv, u"bc"sv, u"d"sv }));
        EXPECT_EQ(test.split_view(u"xx"sv, SplitBehavior::KeepEmpty), Vector({ u"a"sv, u"bc"sv, u"d"sv, u""sv }));
    }
    // KeepTrailingSeparator, alone and combined with KeepEmpty.
    {
        auto test = u"a,,,b"sv;

        EXPECT_EQ(test.split_view(u","sv, SplitBehavior::KeepEmpty), Vector({ u"a"sv, u""sv, u""sv, u"b"sv }));
        EXPECT_EQ(test.split_view(u","sv, SplitBehavior::KeepTrailingSeparator), Vector({ u"a,"sv, u"b"sv }));
        EXPECT_EQ(test.split_view(u","sv, SplitBehavior::KeepTrailingSeparator | SplitBehavior::KeepEmpty), Vector({ u"a,"sv, u","sv, u","sv, u"b"sv }));
    }
    // Plain whitespace-separated words.
    {
        auto test = u"foo bar baz"sv;

        EXPECT_EQ(test.split_view(u" "sv, SplitBehavior::Nothing), Vector({ u"foo"sv, u"bar"sv, u"baz"sv }));
    }
    // Non-ASCII separator (U+03A3, GREEK CAPITAL LETTER SIGMA). The middle part
    // is "2ω": SplitBehavior::Nothing never yields an empty part, and no part
    // of the input may be dropped.
    {
        auto test = u"ωΣ2ωΣω"sv;

        EXPECT_EQ(test.split_view(0x03A3u, SplitBehavior::Nothing), Vector({ u"ω"sv, u"2ω"sv, u"ω"sv }));
    }
}
TEST_CASE(find_code_unit_offset)
{
auto conversion_result = Utf16String::from_utf8("😀foo😀bar"sv);