diff --git a/Libraries/LibRegex/RegexMatch.h b/Libraries/LibRegex/RegexMatch.h
index 1593647fbe8..96c41b0df99 100644
--- a/Libraries/LibRegex/RegexMatch.h
+++ b/Libraries/LibRegex/RegexMatch.h
@@ -561,6 +561,45 @@ struct MatchState {
     COWVector> capture_group_matches;
     COWVector repetition_marks;
     Vector checkpoints;
+
+    // Returns a 64-bit fingerprint of this state, meant for cheaply detecting already-visited states.
+    // Covers the position fields, initiating_fork, repetition_marks and checkpoints;
+    // capture_group_matches is not hashed. Distinct states can still share a fingerprint.
+    //
+    // Author's measurements, taken before vector lengths were mixed in (TODO: re-measure):
+    // For size_t in {0..100}, ips in {0..500} and repetitions in {0..30}, there are zero collisions.
+    // For the full range, zero collisions were found in 8 million random samples.
+    u64 u64_hash() const
+    {
+        u64 hash = 0xcbf29ce484222325; // FNV-1a 64-bit offset basis.
+        // Scalar mixing in the style of boost::hash_combine (64-bit golden-ratio constant).
+        auto combine = [&hash](auto value) {
+            hash ^= value + 0x9e3779b97f4a7c15 + (hash << 6) + (hash >> 2);
+        };
+        auto combine_vector = [&hash](auto const& vector) {
+            // Mix in the length first: without it only the concatenation of the two vectors
+            // is hashed, so e.g. marks=[x], checkpoints=[] and marks=[], checkpoints=[x] collide.
+            hash ^= vector.size();
+            hash *= 0x100000001b3;
+            // FNV-1a-style step (xor, then multiply by the 64-bit FNV prime) per element.
+            for (auto& value : vector) {
+                hash ^= value;
+                hash *= 0x100000001b3;
+            }
+        };
+
+        combine(string_position_before_match);
+        combine(string_position);
+        combine(string_position_in_code_units);
+        combine(instruction_position);
+        combine(fork_at_position);
+        // The +has_value() keeps "no initiating fork" (0) distinct from "initiating fork 0" (1).
+        combine(initiating_fork.value_or(0) + initiating_fork.has_value());
+        combine_vector(repetition_marks);
+        combine_vector(checkpoints);
+
+        return hash;
+    }
 };
 
 }
diff --git a/Libraries/LibRegex/RegexMatcher.cpp b/Libraries/LibRegex/RegexMatcher.cpp
index 22611dbe057..60394c57023 100644
--- a/Libraries/LibRegex/RegexMatcher.cpp
+++ b/Libraries/LibRegex/RegexMatcher.cpp
@@ -468,6 +468,9 @@ bool Matcher::execute(MatchInput const& input, MatchState& state, size_t
     }
 
     BumpAllocatedLinkedList states_to_try_next;
+    // Hashes of states already resumed from states_to_try_next in the failure branches below;
+    // a popped state whose hash is already present is skipped instead of being executed again.
+    HashTable seen_state_hashes;
 #if REGEX_DEBUG
     size_t recursion_level = 0;
 #endif
@@ -545,17 +548,34 @@ bool Matcher::execute(MatchInput const& input, MatchState& state, size_t
             continue;
         case ExecutionResult::Succeeded:
             return true;
-        case ExecutionResult::Failed:
-            if (!states_to_try_next.is_empty()) {
+        case ExecutionResult::Failed: {
+            bool found = false;
+            while (!states_to_try_next.is_empty()) {
                 state = states_to_try_next.take_last();
+                if (auto hash = state.u64_hash(); seen_state_hashes.set(hash) != HashSetResult::InsertedNewEntry) {
+                    dbgln_if(REGEX_DEBUG, "Already seen state, skipping: {}", hash);
+                    continue;
+                }
+                found = true;
+                break;
+            }
+            if (found)
                 continue;
-            }
             return false;
+        }
         case ExecutionResult::Failed_ExecuteLowPrioForks: {
-            if (states_to_try_next.is_empty()) {
-                return false;
+            bool found = false;
+            while (!states_to_try_next.is_empty()) {
+                state = states_to_try_next.take_last();
+                if (auto hash = state.u64_hash(); seen_state_hashes.set(hash) != HashSetResult::InsertedNewEntry) {
+                    dbgln_if(REGEX_DEBUG, "Already seen state, skipping: {}", hash);
+                    continue;
+                }
+                found = true;
+                break;
             }
-            state = states_to_try_next.take_last();
+            if (!found)
+                return false;
 #if REGEX_DEBUG
             ++recursion_level;
 #endif
diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp
index c27fde606c6..d0ac65110a0 100644
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@@ -1004,9 +1004,21 @@ static auto g_lots_of_a_s = ByteString::repeated('a', 10'000'000);
 
 BENCHMARK_CASE(fork_performance)
 {
-    Regex re("(?:aa)*");
-    auto result = re.match(g_lots_of_a_s);
-    EXPECT_EQ(result.success, true);
+    {
+        Regex re("(?:aa)*");
+        auto result = re.match(g_lots_of_a_s);
+        EXPECT_EQ(result.success, true);
+    }
+    {
+        Regex re("(a+)+b");
+        auto result = re.match(g_lots_of_a_s.substring_view(0, 100));
+        EXPECT_EQ(result.success, false);
+    }
+    {
+        Regex re("^(a|a?)+$");
+        auto result = re.match(ByteString::formatted("{}b", g_lots_of_a_s.substring_view(0, 100)));
+        EXPECT_EQ(result.success, false);
+    }
 }
 
 BENCHMARK_CASE(anchor_performance)