LibRegex: Remove unused Utf8View/Utf32View support in RegexStringView

This commit is contained in:
Andreas Kling 2025-04-15 16:11:13 +02:00
parent 466c793fdb
commit 7029957a6b
2 changed files with 15 additions and 187 deletions

View file

@ -14,7 +14,6 @@
#include <AK/COWVector.h>
#include <AK/FlyString.h>
#include <AK/MemMem.h>
#include <AK/RedBlackTree.h>
#include <AK/StringBuilder.h>
#include <AK/StringView.h>
#include <AK/Utf16View.h>
@ -29,11 +28,6 @@ class RegexStringView {
public:
RegexStringView() = default;
RegexStringView(ByteString const& string)
: m_view(string.view())
{
}
RegexStringView(String const& string)
: m_view(string.bytes_as_string_view())
{
@ -44,22 +38,11 @@ public:
{
}
RegexStringView(Utf32View view)
: m_view(view)
{
}
RegexStringView(Utf16View view)
: m_view(view)
{
}
RegexStringView(Utf8View view)
: m_view(view)
{
}
RegexStringView(ByteString&&) = delete;
RegexStringView(String&&) = delete;
bool is_string_view() const
@ -72,21 +55,11 @@ public:
return m_view.get<StringView>();
}
Utf32View const& u32_view() const
{
return m_view.get<Utf32View>();
}
Utf16View const& u16_view() const
{
return m_view.get<Utf16View>();
}
Utf8View const& u8_view() const
{
return m_view.get<Utf8View>();
}
bool unicode() const { return m_unicode; }
void set_unicode(bool unicode) { m_unicode = unicode; }
@ -115,14 +88,12 @@ public:
{
return m_view.visit(
[](Utf16View const& view) { return view.length_in_code_units(); },
[](Utf8View const& view) { return view.byte_length(); },
[](auto const& view) { return view.length(); });
}
size_t length_of_code_point(u32 code_point) const
{
return m_view.visit(
[](Utf32View const&) { return 1; },
[&](Utf16View const&) {
if (code_point < 0x10000)
return 1;
@ -159,9 +130,6 @@ public:
optional_string_storage = builder.to_byte_string();
return RegexStringView { T { *optional_string_storage } };
},
[&](Utf32View) {
return RegexStringView { Utf32View { data.data(), data.size() } };
},
[&](Utf16View) {
optional_utf16_storage = AK::utf32_to_utf16(Utf32View { data.data(), data.size() }).release_value_but_fixme_should_propagate_errors();
return RegexStringView { Utf16View { optional_utf16_storage } };
@ -181,24 +149,6 @@ public:
new_views.empend(view);
return new_views;
},
[](Utf32View view) {
if (view.is_empty())
return Vector<RegexStringView> { view };
Vector<RegexStringView> views;
u32 newline = '\n';
while (!view.is_empty()) {
auto position = AK::memmem_optional(view.code_points(), view.length() * sizeof(u32), &newline, sizeof(u32));
if (!position.has_value())
break;
auto offset = position.value() / sizeof(u32);
views.empend(view.substring_view(0, offset));
view = view.substring_view(offset + 1, view.length() - offset - 1);
}
if (!view.is_empty())
views.empend(view);
return views;
},
[](Utf16View view) {
if (view.is_empty())
return Vector<RegexStringView> { view };
@ -216,34 +166,6 @@ public:
if (!view.is_empty())
views.empend(view);
return views;
},
[](Utf8View const& view) {
if (view.is_empty())
return Vector<RegexStringView> { view };
Vector<RegexStringView> views;
auto it = view.begin();
auto previous_newline_position_it = it;
for (;;) {
if (*it == '\n') {
auto previous_offset = view.byte_offset_of(previous_newline_position_it);
auto new_offset = view.byte_offset_of(it);
auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
views.empend(slice);
++it;
previous_newline_position_it = it;
}
if (it.done())
break;
++it;
}
if (it != previous_newline_position_it) {
auto previous_offset = view.byte_offset_of(previous_newline_position_it);
auto new_offset = view.byte_offset_of(it);
auto slice = view.substring_view(previous_offset, new_offset - previous_offset);
views.empend(slice);
}
return views;
});
}
@ -252,8 +174,7 @@ public:
if (unicode()) {
auto view = m_view.visit(
[&](auto view) { return RegexStringView { view.substring_view(offset, length) }; },
[&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; },
[&](Utf8View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
[&](Utf16View const& view) { return RegexStringView { view.unicode_substring_view(offset, length) }; });
view.set_unicode(unicode());
return view;
@ -302,13 +223,7 @@ public:
return ch;
}
},
[&](Utf32View const& view) -> u32 { return view[index]; },
[&](Utf16View const& view) -> u32 { return view.code_point_at(index); },
[&](Utf8View const& view) -> u32 {
auto it = view.iterator_at_byte_offset(index);
VERIFY(it != view.end());
return *it;
});
[&](Utf16View const& view) -> u32 { return view.code_point_at(index); });
}
u32 code_unit_at(size_t code_unit_index) const
@ -325,13 +240,7 @@ public:
return ch;
}
},
[&](Utf32View const& view) -> u32 { return view[code_unit_index]; },
[&](Utf16View const& view) -> u32 { return view.code_unit_at(code_unit_index); },
[&](Utf8View const& view) -> u32 {
auto it = view.iterator_at_byte_offset(code_unit_index);
VERIFY(it != view.end());
return *it;
});
[&](Utf16View const& view) -> u32 { return view.code_unit_at(code_unit_index); });
}
size_t code_unit_offset_of(size_t code_point_index) const
@ -341,71 +250,32 @@ public:
Utf8View utf8_view { view };
return utf8_view.byte_offset_of(code_point_index);
},
[&](Utf32View const&) -> u32 { return code_point_index; },
[&](Utf16View const& view) -> u32 {
return view.code_unit_offset_of(code_point_index);
},
[&](Utf8View const& view) -> u32 {
return view.byte_offset_of(code_point_index);
});
}
bool operator==(char const* cstring) const
{
return m_view.visit(
[&](Utf32View) { return to_byte_string() == cstring; },
[&](Utf16View) { return to_byte_string() == cstring; },
[&](Utf8View const& view) { return view.as_string() == cstring; },
[&](StringView view) { return view == cstring; });
}
bool operator==(ByteString const& string) const
{
return m_view.visit(
[&](Utf32View) { return to_byte_string() == string; },
[&](Utf16View) { return to_byte_string() == string; },
[&](Utf8View const& view) { return view.as_string() == string; },
[&](StringView view) { return view == string; });
}
bool operator==(StringView string) const
{
return m_view.visit(
[&](Utf32View) { return to_byte_string() == string; },
[&](Utf16View) { return to_byte_string() == string; },
[&](Utf8View const& view) { return view.as_string() == string; },
[&](StringView view) { return view == string; });
}
bool operator==(Utf32View const& other) const
{
return m_view.visit(
[&](Utf32View view) {
return view.length() == other.length() && __builtin_memcmp(view.code_points(), other.code_points(), view.length() * sizeof(u32)) == 0;
},
[&](Utf16View) { return to_byte_string() == RegexStringView { other }.to_byte_string(); },
[&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_byte_string(); },
[&](StringView view) { return view == RegexStringView { other }.to_byte_string(); });
}
bool operator==(Utf16View const& other) const
{
return m_view.visit(
[&](Utf32View) { return to_byte_string() == RegexStringView { other }.to_byte_string(); },
[&](Utf16View const& view) { return view == other; },
[&](Utf8View const& view) { return view.as_string() == RegexStringView { other }.to_byte_string(); },
[&](StringView view) { return view == RegexStringView { other }.to_byte_string(); });
}
bool operator==(Utf8View const& other) const
{
return m_view.visit(
[&](Utf32View) { return to_byte_string() == other.as_string(); },
[&](Utf16View) { return to_byte_string() == other.as_string(); },
[&](Utf8View const& view) { return view.as_string() == other.as_string(); },
[&](StringView view) { return other.as_string() == view; });
}
bool equals(RegexStringView other) const
{
return other.m_view.visit([this](auto const& view) { return operator==(view); });
@ -431,48 +301,15 @@ public:
bool starts_with(StringView str) const
{
return m_view.visit(
[&](Utf32View) -> bool {
TODO();
},
[&](Utf16View) -> bool {
TODO();
},
[&](Utf8View const& view) { return view.as_string().starts_with(str); },
[&](StringView view) { return view.starts_with(str); });
}
bool starts_with(Utf32View const& str) const
{
return m_view.visit(
[&](Utf32View view) -> bool {
if (str.length() > view.length())
return false;
if (str.length() == view.length())
return operator==(str);
for (size_t i = 0; i < str.length(); ++i) {
if (str.at(i) != view.at(i))
return false;
}
return true;
},
[&](Utf16View) -> bool { TODO(); },
[&](Utf8View const& view) {
auto it = view.begin();
for (auto code_point : str) {
if (it.done())
return false;
if (code_point != *it)
return false;
++it;
}
return true;
},
[&](StringView) -> bool { TODO(); });
}
private:
Variant<StringView, Utf8View, Utf16View, Utf32View> m_view { StringView {} };
bool m_unicode { false };
[[no_unique_address]] Variant<StringView, Utf16View> m_view { StringView {} };
[[no_unique_address]] bool m_unicode { false };
};
class Match final {

View file

@ -253,20 +253,11 @@ TEST_CASE(catch_all_again)
EXPECT_EQ(has_match("Hello World"sv, re), true);
}
TEST_CASE(char_utf8)
{
Regex<PosixExtended> re("😀");
RegexResult result;
EXPECT_EQ((result = match(Utf8View { "Привет, мир! 😀 γειά σου κόσμος 😀 こんにちは世界"sv }, re, PosixFlags::Global)).success, true);
EXPECT_EQ(result.count, 2u);
}
TEST_CASE(catch_all_newline)
{
Regex<PosixExtended> re("^.*$", PosixFlags::Multiline);
RegexResult result;
ByteString aaa = "Hello World\nTest\n1234\n";
String aaa = "Hello World\nTest\n1234\n"_string;
auto lambda = [&]() {
result = match(aaa, re);
EXPECT_EQ(result.success, true);
@ -283,7 +274,7 @@ TEST_CASE(catch_all_newline_view)
Regex<PosixExtended> re("^.*$", PosixFlags::Multiline);
RegexResult result;
ByteString aaa = "Hello World\nTest\n1234\n";
String aaa = "Hello World\nTest\n1234\n"_string;
result = match(aaa, re);
EXPECT_EQ(result.success, true);
EXPECT_EQ(result.count, 3u);
@ -313,7 +304,7 @@ TEST_CASE(catch_all_newline_2)
TEST_CASE(match_all_character_class)
{
Regex<PosixExtended> re("[[:alpha:]]");
ByteString str = "[Window]\nOpacity=255\nAudibleBeep=0\n";
String str = "[Window]\nOpacity=255\nAudibleBeep=0\n"_string;
RegexResult result = match(str, re, PosixFlags::Global);
EXPECT_EQ(result.success, true);
@ -326,7 +317,7 @@ TEST_CASE(match_all_character_class)
TEST_CASE(match_character_class_with_assertion)
{
Regex<PosixExtended> re("[[:alpha:]]+$");
ByteString str = "abcdef";
String str = "abcdef"_string;
RegexResult result = match(str, re);
EXPECT_EQ(result.success, true);
@ -421,7 +412,7 @@ TEST_CASE(named_capture_group)
regex_dbg.print_bytecode(re);
}
ByteString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"_string;
EXPECT_EQ(re.search(haystack, result, PosixFlags::Multiline), true);
EXPECT_EQ(result.count, 2u);
EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
@ -444,7 +435,7 @@ TEST_CASE(ecma262_named_capture_group_with_dollar_sign)
regex_dbg.print_bytecode(re);
}
ByteString haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n";
String haystack = "[Window]\nOpacity=255\nAudibleBeep=0\n"_string;
EXPECT_EQ(re.search(haystack, result, ECMAScriptFlags::Multiline), true);
EXPECT_EQ(result.count, 2u);
EXPECT_EQ(result.matches.at(0).view, "Opacity=255");
@ -1009,7 +1000,7 @@ TEST_CASE(case_insensitive_match)
TEST_CASE(extremely_long_fork_chain)
{
Regex<ECMA262> re("(?:aa)*");
auto input = ByteString::repeated('a', 1000);
auto input = MUST(String::repeated('a', 1000));
auto result = re.match(input);
EXPECT_EQ(result.success, true);
}
@ -1037,7 +1028,7 @@ TEST_CASE(theoretically_infinite_loop)
}
}
static auto g_lots_of_a_s = ByteString::repeated('a', 10'000'000);
static auto g_lots_of_a_s = String::repeated('a', 10'000'000).release_value();
BENCHMARK_CASE(fork_performance)
{
@ -1048,12 +1039,12 @@ BENCHMARK_CASE(fork_performance)
}
{
Regex<ECMA262> re("(a+)+b");
auto result = re.match(g_lots_of_a_s.substring_view(0, 100));
auto result = re.match(g_lots_of_a_s.bytes_as_string_view().substring_view(0, 100));
EXPECT_EQ(result.success, false);
}
{
Regex<ECMA262> re("^(a|a?)+$");
auto input = ByteString::formatted("{}b", g_lots_of_a_s.substring_view(0, 100));
auto input = MUST(String::formatted("{}b", g_lots_of_a_s.bytes_as_string_view().substring_view(0, 100)));
auto result = re.match(input);
EXPECT_EQ(result.success, false);
}