From de588a97c011dbb6d4ee69bc37281870d49d3ce3 Mon Sep 17 00:00:00 2001 From: Gingeh <39150378+Gingeh@users.noreply.github.com> Date: Mon, 30 Sep 2024 12:49:15 +1000 Subject: [PATCH] LibRegex: Only search start of line if pattern begins with ^ --- Tests/LibRegex/Regex.cpp | 24 +++++++++++++++++++ Userland/Libraries/LibRegex/RegexMatcher.cpp | 3 ++- .../Libraries/LibRegex/RegexOptimizer.cpp | 6 +++++ Userland/Libraries/LibRegex/RegexParser.h | 1 + 4 files changed, 33 insertions(+), 1 deletion(-) diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 63b7b911d3f..7ad236110b4 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -998,6 +998,15 @@ BENCHMARK_CASE(fork_performance) EXPECT_EQ(result.success, true); } +BENCHMARK_CASE(anchor_performance) +{ + Regex re("^b"); + for (auto i = 0; i < 100'000; i++) { + auto result = re.match(g_lots_of_a_s); + EXPECT_EQ(result.success, false); + } +} + TEST_CASE(optimizer_atomic_groups) { Array tests { @@ -1078,6 +1087,21 @@ TEST_CASE(optimizer_alternation) } } +TEST_CASE(start_anchor) +{ + // Ensure that a circumflex at the start only matches the start of the line. + { + Regex re("^abc"); + EXPECT_EQ(re.match("123abcdef"sv, PosixFlags::Global).success, false); + EXPECT_EQ(re.match("abc123"sv, PosixFlags::Global).success, true); + EXPECT_EQ(re.match("123^abcdef"sv, PosixFlags::Global).success, false); + EXPECT_EQ(re.match("^abc123"sv, PosixFlags::Global).success, false); + + // Multiple lines + EXPECT_EQ(re.match("123\nabc"sv, PosixFlags::Multiline).success, true); + } +} + TEST_CASE(posix_basic_dollar_is_end_anchor) { // Ensure that a dollar sign at the end only matches the end of the line. 
diff --git a/Userland/Libraries/LibRegex/RegexMatcher.cpp b/Userland/Libraries/LibRegex/RegexMatcher.cpp index df496bcb444..b105425eb41 100644 --- a/Userland/Libraries/LibRegex/RegexMatcher.cpp +++ b/Userland/Libraries/LibRegex/RegexMatcher.cpp @@ -184,6 +184,7 @@ RegexResult Matcher::match(Vector const& views, Optiona continue_search = false; auto single_match_only = input.regex_options.has_flag_set(AllFlags::SingleMatch); + auto only_start_of_line = m_pattern->parser_result.optimization_data.only_start_of_line && !input.regex_options.has_flag_set(AllFlags::Multiline); for (auto const& view : views) { if (lines_to_skip != 0) { @@ -294,7 +295,7 @@ RegexResult Matcher::match(Vector const& views, Optiona break; } - if (!continue_search) + if (!continue_search || only_start_of_line) break; } diff --git a/Userland/Libraries/LibRegex/RegexOptimizer.cpp b/Userland/Libraries/LibRegex/RegexOptimizer.cpp index 914821ce27d..75858ae64aa 100644 --- a/Userland/Libraries/LibRegex/RegexOptimizer.cpp +++ b/Userland/Libraries/LibRegex/RegexOptimizer.cpp @@ -36,6 +36,12 @@ void Regex::run_optimization_passes() // e.g. a*b -> (ATOMIC a*)b attempt_rewrite_loops_as_atomic_groups(blocks); + // FIXME: There are a few more conditions this can be true in (e.g. when the leading ^ is within an arbitrarily nested capture group). + MatchState state; + auto& opcode = parser_result.bytecode.get_opcode(state); + if (opcode.opcode_id() == OpCodeId::CheckBegin) + parser_result.optimization_data.only_start_of_line = true; + parser_result.bytecode.flatten(); } diff --git a/Userland/Libraries/LibRegex/RegexParser.h b/Userland/Libraries/LibRegex/RegexParser.h index bb01ce56423..41856b6113a 100644 --- a/Userland/Libraries/LibRegex/RegexParser.h +++ b/Userland/Libraries/LibRegex/RegexParser.h @@ -58,6 +58,7 @@ public: struct { Optional pure_substring_search; + bool only_start_of_line = false; } optimization_data {}; };