From baddac5155dc40b90fedd464530ba6218a23db9b Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Sat, 26 Jul 2025 10:09:06 -0400 Subject: [PATCH] AK: Implement a method to split a UTF-16 string --- AK/Utf16StringBase.h | 15 ++++++++++++++ AK/Utf16View.cpp | 18 +++++++++++++++++ AK/Utf16View.h | 41 ++++++++++++++++++++++++++++++++++++++ Tests/AK/TestUtf16View.cpp | 41 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 115 insertions(+) diff --git a/AK/Utf16StringBase.h b/AK/Utf16StringBase.h index 3242d725e8d..3b8d9e9aa71 100644 --- a/AK/Utf16StringBase.h +++ b/AK/Utf16StringBase.h @@ -225,6 +225,21 @@ public: [[nodiscard]] ALWAYS_INLINE bool starts_with(Utf16View const& needle) const { return utf16_view().starts_with(needle); } [[nodiscard]] ALWAYS_INLINE bool ends_with(Utf16View const& needle) const { return utf16_view().ends_with(needle); } + [[nodiscard]] ALWAYS_INLINE Vector split_view(char16_t needle, SplitBehavior split_behavior) const { return utf16_view().split_view(needle, split_behavior); } + [[nodiscard]] ALWAYS_INLINE Vector split_view(Utf16View const& needle, SplitBehavior split_behavior) const { return utf16_view().split_view(needle, split_behavior); } + + template + ALWAYS_INLINE void for_each_split_view(char16_t separator, SplitBehavior split_behavior, Callback&& callback) const + { + utf16_view().for_each_split_view(separator, split_behavior, forward(callback)); + } + + template + ALWAYS_INLINE void for_each_split_view(Utf16View const& separator, SplitBehavior split_behavior, Callback&& callback) const + { + utf16_view().for_each_split_view(separator, split_behavior, forward(callback)); + } + // This is primarily interesting to unit tests. 
[[nodiscard]] constexpr bool has_short_ascii_storage() const { diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp index de6ef1d546f..a653cb1dc82 100644 --- a/AK/Utf16View.cpp +++ b/AK/Utf16View.cpp @@ -254,6 +254,24 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod VERIFY_NOT_REACHED(); } +Vector Utf16View::split_view(char16_t separator, SplitBehavior split_behavior) const +{ + Utf16View separator_view { &separator, 1 }; + return split_view(separator_view, split_behavior); +} + +Vector Utf16View::split_view(Utf16View const& separator, SplitBehavior split_behavior) const +{ + Vector parts; + + for_each_split_view(separator, split_behavior, [&](auto const& part) { + parts.append(part); + return IterationDecision::Continue; + }); + + return parts; +} + size_t Utf16View::calculate_length_in_code_points() const { ASSERT(!has_ascii_storage()); diff --git a/AK/Utf16View.h b/AK/Utf16View.h index df33da86d12..5a5c2d8852b 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -515,6 +516,46 @@ public: return substring_view(length_in_code_units() - needle_length, needle_length) == needle; } + [[nodiscard]] Vector split_view(char16_t, SplitBehavior) const; + [[nodiscard]] Vector split_view(Utf16View const&, SplitBehavior) const; + + template + constexpr void for_each_split_view(char16_t separator, SplitBehavior split_behavior, Callback&& callback) const + { + Utf16View separator_view { &separator, 1 }; + for_each_split_view(separator_view, split_behavior, forward(callback)); + } + + template + constexpr void for_each_split_view(Utf16View const& separator, SplitBehavior split_behavior, Callback&& callback) const + { + VERIFY(!separator.is_empty()); + + if (is_empty()) + return; + + bool keep_empty = has_flag(split_behavior, SplitBehavior::KeepEmpty); + bool keep_separator = has_flag(split_behavior, SplitBehavior::KeepTrailingSeparator); + + auto view { *this }; + + for
(auto index = view.find_code_unit_offset(separator); index.has_value(); index = view.find_code_unit_offset(separator)) { + if (keep_empty || *index > 0) { + auto part = keep_separator + ? view.substring_view(0, *index + separator.length_in_code_units()) + : view.substring_view(0, *index); + + if (callback(part) == IterationDecision::Break) + return; + } + + view = view.substring_view(*index + separator.length_in_code_units()); + } + + if (keep_empty || !view.is_empty()) + callback(view); + } + // https://infra.spec.whatwg.org/#code-unit-less-than [[nodiscard]] constexpr bool is_code_unit_less_than(Utf16View const& other) const { diff --git a/Tests/AK/TestUtf16View.cpp b/Tests/AK/TestUtf16View.cpp index 8c17e4f5961..0721192b0eb 100644 --- a/Tests/AK/TestUtf16View.cpp +++ b/Tests/AK/TestUtf16View.cpp @@ -634,6 +634,47 @@ TEST_CASE(ends_with) EXPECT(!emoji.ends_with(u"😀"sv)); } +TEST_CASE(split_view) +{ + { + auto test = u"axxbxcxd"sv; + + EXPECT_EQ(test.split_view('x', SplitBehavior::Nothing), Vector({ u"a"sv, u"b"sv, u"c"sv, u"d"sv })); + EXPECT_EQ(test.split_view("x"sv, SplitBehavior::Nothing), Vector({ u"a"sv, u"b"sv, u"c"sv, u"d"sv })); + + EXPECT_EQ(test.split_view('x', SplitBehavior::KeepEmpty), Vector({ u"a"sv, u""sv, u"b"sv, u"c"sv, u"d"sv })); + EXPECT_EQ(test.split_view("x"sv, SplitBehavior::KeepEmpty), Vector({ u"a"sv, u""sv, u"b"sv, u"c"sv, u"d"sv })); + } + { + auto test = u"axxbx"sv; + + EXPECT_EQ(test.split_view('x', SplitBehavior::Nothing), Vector({ u"a"sv, u"b"sv })); + EXPECT_EQ(test.split_view("x"sv, SplitBehavior::Nothing), Vector({ u"a"sv, u"b"sv })); + + EXPECT_EQ(test.split_view('x', SplitBehavior::KeepEmpty), Vector({ u"a"sv, u""sv, u"b"sv, u""sv })); + EXPECT_EQ(test.split_view("x"sv, SplitBehavior::KeepEmpty), Vector({ u"a"sv, u""sv, u"b"sv, u""sv })); + } + { + auto test = u"axxbcxxdxx"sv; + EXPECT_EQ(test.split_view(u"xx"sv, SplitBehavior::Nothing), Vector({ u"a"sv, u"bc"sv, u"d"sv })); + EXPECT_EQ(test.split_view(u"xx"sv, 
SplitBehavior::KeepEmpty), Vector({ u"a"sv, u"bc"sv, u"d"sv, u""sv })); + } + { + auto test = u"a,,,b"sv; + EXPECT_EQ(test.split_view(u","sv, SplitBehavior::KeepEmpty), Vector({ u"a"sv, u""sv, u""sv, u"b"sv })); + EXPECT_EQ(test.split_view(u","sv, SplitBehavior::KeepTrailingSeparator), Vector({ u"a,"sv, u"b"sv })); + EXPECT_EQ(test.split_view(u","sv, SplitBehavior::KeepTrailingSeparator | SplitBehavior::KeepEmpty), Vector({ u"a,"sv, u","sv, u","sv, u"b"sv })); + } + { + auto test = u"foo bar baz"sv; + EXPECT_EQ(test.split_view(u" "sv, SplitBehavior::Nothing), Vector({ u"foo"sv, u"bar"sv, u"baz"sv })); + } + { + auto test = u"ωΣ2ωΣω"sv; + EXPECT_EQ(test.split_view(0x03A3u, SplitBehavior::Nothing), Vector({ u"ω"sv, u"2ω"sv, u"ω"sv })); + } +} + TEST_CASE(find_code_unit_offset) { auto conversion_result = Utf16String::from_utf8("😀foo😀bar"sv);