From 446a453719bf1a10dd0076c8ad5d2e4d47b7775c Mon Sep 17 00:00:00 2001 From: Ali Mohammad Pur Date: Tue, 15 Apr 2025 21:32:26 +0200 Subject: [PATCH] LibRegex: Pull out the first compare to avoid unnecessary execution This adds a fast-path to drop view indices we know will not match immediately without going through the regex VM. --- Libraries/LibRegex/RegexMatcher.cpp | 35 ++- Libraries/LibRegex/RegexMatcher.h | 1 + Libraries/LibRegex/RegexOptimizer.cpp | 329 +++++++++++++++++--------- Libraries/LibRegex/RegexParser.h | 2 + 4 files changed, 250 insertions(+), 117 deletions(-) diff --git a/Libraries/LibRegex/RegexMatcher.cpp b/Libraries/LibRegex/RegexMatcher.cpp index fed306ce706..ee2b2a29183 100644 --- a/Libraries/LibRegex/RegexMatcher.cpp +++ b/Libraries/LibRegex/RegexMatcher.cpp @@ -4,6 +4,7 @@ * SPDX-License-Identifier: BSD-2-Clause */ +#include #include #include #include @@ -208,6 +209,23 @@ RegexResult Matcher::match(Vector const& views, Optiona auto single_match_only = input.regex_options.has_flag_set(AllFlags::SingleMatch); auto only_start_of_line = m_pattern->parser_result.optimization_data.only_start_of_line && !input.regex_options.has_flag_set(AllFlags::Multiline); + auto compare_range = [insensitive = input.regex_options & AllFlags::Insensitive](auto needle, CharRange range) { + auto upper_case_needle = needle; + auto lower_case_needle = needle; + if (insensitive) { + upper_case_needle = to_ascii_uppercase(needle); + lower_case_needle = to_ascii_lowercase(needle); + } + + if (lower_case_needle >= range.from && lower_case_needle <= range.to) + return 0; + if (upper_case_needle >= range.from && upper_case_needle <= range.to) + return 0; + if (lower_case_needle > range.to || upper_case_needle > range.to) + return 1; + return -1; + }; + for (auto const& view : views) { if (lines_to_skip != 0) { ++input.line; @@ -253,19 +271,26 @@ RegexResult Matcher::match(Vector const& views, Optiona } for (; view_index <= view_length; ++view_index) { - if (view_index == view_length && input.regex_options.has_flag_set(AllFlags::Multiline)) - break; + if (view_index == view_length) { + if (input.regex_options.has_flag_set(AllFlags::Multiline)) + break; + } - auto& match_length_minimum = m_pattern->parser_result.match_length_minimum; // FIXME: More performant would be to know the remaining minimum string // length needed to match from the current position onwards within // the vm. Add new OpCode for MinMatchLengthFromSp with the value of // the remaining string length from the current path. The value though // has to be filled in reverse. That implies a second run over bytecode // after generation has finished. + auto const match_length_minimum = m_pattern->parser_result.match_length_minimum; if (match_length_minimum && match_length_minimum > view_length - view_index) break; + if (auto& starting_ranges = m_pattern->parser_result.optimization_data.starting_ranges; !starting_ranges.is_empty()) { + if (!binary_search(starting_ranges, input.view.code_unit_at(view_index), nullptr, compare_range)) + goto done_matching; + } + input.column = match_count; input.match_index = match_count; @@ -274,8 +299,7 @@ RegexResult Matcher::match(Vector const& views, Optiona state.instruction_position = 0; state.repetition_marks.clear(); - auto success = execute(input, state, operations); - if (success) { + if (execute(input, state, operations)) { succeeded = true; if (input.regex_options.has_flag_set(AllFlags::MatchNotEndOfLine) && state.string_position == input.view.length()) { @@ -315,6 +339,7 @@ RegexResult Matcher::match(Vector const& views, Optiona break; } + done_matching: if (!continue_search || only_start_of_line) break; } diff --git a/Libraries/LibRegex/RegexMatcher.h b/Libraries/LibRegex/RegexMatcher.h index 4332be0b4bb..c054e444a20 100644 --- a/Libraries/LibRegex/RegexMatcher.h +++ b/Libraries/LibRegex/RegexMatcher.h @@ -230,6 +230,7 @@ private: void run_optimization_passes(); void attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&); bool attempt_rewrite_entire_match_as_substring_search(BasicBlockList const&); + void fill_optimization_data(BasicBlockList const&); }; // free standing functions for match, search and has_match diff --git a/Libraries/LibRegex/RegexOptimizer.cpp b/Libraries/LibRegex/RegexOptimizer.cpp index b84462e71b0..5e69c065cf4 100644 --- a/Libraries/LibRegex/RegexOptimizer.cpp +++ b/Libraries/LibRegex/RegexOptimizer.cpp @@ -36,15 +36,211 @@ void Regex::run_optimization_passes() // e.g. a*b -> (ATOMIC a*)b attempt_rewrite_loops_as_atomic_groups(blocks); - // FIXME: "There are a few more conditions this can be true in (e.g. within an arbitrarily nested capture group)" - auto state = MatchState::only_for_enumeration(); - auto& opcode = parser_result.bytecode.get_opcode(state); - if (opcode.opcode_id() == OpCodeId::CheckBegin) - parser_result.optimization_data.only_start_of_line = true; + fill_optimization_data(split_basic_blocks(parser_result.bytecode)); parser_result.bytecode.flatten(); } +struct StaticallyInterpretedCompares { + RedBlackTree lhs_ranges; + RedBlackTree lhs_negated_ranges; + HashTable lhs_char_classes; + HashTable lhs_negated_char_classes; + + bool has_any_unicode_property = false; + HashTable lhs_unicode_general_categories; + HashTable lhs_unicode_properties; + HashTable lhs_unicode_scripts; + HashTable lhs_unicode_script_extensions; + HashTable lhs_negated_unicode_general_categories; + HashTable lhs_negated_unicode_properties; + HashTable lhs_negated_unicode_scripts; + HashTable lhs_negated_unicode_script_extensions; +}; + +static bool interpret_compares(Vector const& lhs, StaticallyInterpretedCompares& compares) +{ + bool inverse { false }; + bool temporary_inverse { false }; + bool reset_temporary_inverse { false }; + + auto current_lhs_inversion_state = [&]() -> bool { return temporary_inverse ^ inverse; }; + + auto& lhs_ranges = compares.lhs_ranges; + auto& lhs_negated_ranges = compares.lhs_negated_ranges; + auto& lhs_char_classes = compares.lhs_char_classes; + auto& lhs_negated_char_classes = compares.lhs_negated_char_classes; + auto& has_any_unicode_property = compares.has_any_unicode_property; + auto& lhs_unicode_general_categories = compares.lhs_unicode_general_categories; + auto& lhs_unicode_properties = compares.lhs_unicode_properties; + auto& lhs_unicode_scripts = compares.lhs_unicode_scripts; + auto& lhs_unicode_script_extensions = compares.lhs_unicode_script_extensions; + auto& lhs_negated_unicode_general_categories = compares.lhs_negated_unicode_general_categories; + auto& lhs_negated_unicode_properties = compares.lhs_negated_unicode_properties; + auto& lhs_negated_unicode_scripts = compares.lhs_negated_unicode_scripts; + auto& lhs_negated_unicode_script_extensions = compares.lhs_negated_unicode_script_extensions; + + for (auto const& pair : lhs) { + if (reset_temporary_inverse) { + reset_temporary_inverse = false; + temporary_inverse = false; + } else { + reset_temporary_inverse = true; + } + + switch (pair.type) { + case CharacterCompareType::Inverse: + inverse = !inverse; + break; + case CharacterCompareType::TemporaryInverse: + temporary_inverse = true; + reset_temporary_inverse = false; + break; + case CharacterCompareType::AnyChar: + // Special case: if not inverted, AnyChar is always in the range. + if (!current_lhs_inversion_state()) + return false; + break; + case CharacterCompareType::Char: + if (!current_lhs_inversion_state()) + lhs_ranges.insert(pair.value, pair.value); + else + lhs_negated_ranges.insert(pair.value, pair.value); + break; + case CharacterCompareType::String: + // FIXME: We just need to look at the last character of this string, but we only have the first character here. + // Just bail out to avoid false positives. + return false; + case CharacterCompareType::CharClass: + if (!current_lhs_inversion_state()) + lhs_char_classes.set(static_cast(pair.value)); + else + lhs_negated_char_classes.set(static_cast(pair.value)); + break; + case CharacterCompareType::CharRange: { + auto range = CharRange(pair.value); + if (!current_lhs_inversion_state()) + lhs_ranges.insert(range.from, range.to); + else + lhs_negated_ranges.insert(range.from, range.to); + break; + } + case CharacterCompareType::LookupTable: + // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it. + return false; + case CharacterCompareType::Reference: + // We've handled this before coming here. + break; + case CharacterCompareType::Property: + has_any_unicode_property = true; + if (!current_lhs_inversion_state()) + lhs_unicode_properties.set(static_cast(pair.value)); + else + lhs_negated_unicode_properties.set(static_cast(pair.value)); + break; + case CharacterCompareType::GeneralCategory: + has_any_unicode_property = true; + if (!current_lhs_inversion_state()) + lhs_unicode_general_categories.set(static_cast(pair.value)); + else + lhs_negated_unicode_general_categories.set(static_cast(pair.value)); + break; + case CharacterCompareType::Script: + has_any_unicode_property = true; + if (!current_lhs_inversion_state()) + lhs_unicode_scripts.set(static_cast(pair.value)); + else + lhs_negated_unicode_scripts.set(static_cast(pair.value)); + break; + case CharacterCompareType::ScriptExtension: + has_any_unicode_property = true; + if (!current_lhs_inversion_state()) + lhs_unicode_script_extensions.set(static_cast(pair.value)); + else + lhs_negated_unicode_script_extensions.set(static_cast(pair.value)); + break; + case CharacterCompareType::Or: + case CharacterCompareType::EndAndOr: + // These are the default behaviour for [...], so we don't need to do anything (unless we add support for 'And' below). + break; + case CharacterCompareType::And: + // FIXME: These are too difficult to handle, so bail out. + return false; + case CharacterCompareType::Undefined: + case CharacterCompareType::RangeExpressionDummy: + // These do not occur in valid bytecode. + VERIFY_NOT_REACHED(); + } + } + + return true; +} + +template +void Regex::fill_optimization_data(BasicBlockList const& blocks) +{ + if (blocks.is_empty()) + return; + + if constexpr (REGEX_DEBUG) { + dbgln("Pulling out optimization data from bytecode:"); + RegexDebug dbg; + dbg.print_bytecode(*this); + for (auto const& block : blocks) + dbgln("block from {} to {} (comment: {})", block.start, block.end, block.comment); + } + + ScopeGuard print = [&] { + if constexpr (REGEX_DEBUG) { + dbgln("Optimization data:"); + if (parser_result.optimization_data.starting_ranges.is_empty()) + dbgln("; - no starting ranges"); + for (auto const& range : parser_result.optimization_data.starting_ranges) + dbgln(" - starting range: {}-{}", range.from, range.to); + dbgln("; - only start of line: {}", parser_result.optimization_data.only_start_of_line); + } + }; + + auto& bytecode = parser_result.bytecode; + + auto state = MatchState::only_for_enumeration(); + auto block = blocks.first(); + for (state.instruction_position = block.start; state.instruction_position < block.end;) { + auto& opcode = bytecode.get_opcode(state); + switch (opcode.opcode_id()) { + case OpCodeId::Compare: { + auto flat_compares = static_cast(opcode).flat_compares(); + StaticallyInterpretedCompares compares; + if (!interpret_compares(flat_compares, compares)) + return; // No idea, the bytecode is too complex. + + if (compares.has_any_unicode_property) + return; // Faster to just run the bytecode. + + // FIXME: We should be able to handle these cases (jump ahead while...) + if (!compares.lhs_char_classes.is_empty() || !compares.lhs_negated_char_classes.is_empty() || !compares.lhs_negated_ranges.is_empty()) + return; + + for (auto it = compares.lhs_ranges.begin(); it != compares.lhs_ranges.end(); ++it) + parser_result.optimization_data.starting_ranges.append({ it.key(), *it }); + return; + } + case OpCodeId::CheckBegin: + parser_result.optimization_data.only_start_of_line = true; + return; + case OpCodeId::Checkpoint: + case OpCodeId::Save: + case OpCodeId::ClearCaptureGroup: + case OpCodeId::SaveLeftCaptureGroup: + // These do not 'match' anything, so look through them. + state.instruction_position += opcode.size(); + continue; + default: + return; + } + } +} + template typename Regex::BasicBlockList Regex::split_basic_blocks(ByteCode const& bytecode) { @@ -126,7 +322,6 @@ typename Regex::BasicBlockList Regex::split_basic_blocks(ByteCod static bool has_overlap(Vector const& lhs, Vector const& rhs) { - // We have to fully interpret the two sequences to determine if they overlap (that is, keep track of inversion state and what ranges they cover). bool inverse { false }; bool temporary_inverse { false }; @@ -134,20 +329,20 @@ static bool has_overlap(Vector const& lhs, Vector bool { return temporary_inverse ^ inverse; }; - RedBlackTree lhs_ranges; - RedBlackTree lhs_negated_ranges; - HashTable lhs_char_classes; - HashTable lhs_negated_char_classes; - - auto has_any_unicode_property = false; - HashTable lhs_unicode_general_categories; - HashTable lhs_unicode_properties; - HashTable lhs_unicode_scripts; - HashTable lhs_unicode_script_extensions; - HashTable lhs_negated_unicode_general_categories; - HashTable lhs_negated_unicode_properties; - HashTable lhs_negated_unicode_scripts; - HashTable lhs_negated_unicode_script_extensions; + StaticallyInterpretedCompares compares; + auto& lhs_ranges = compares.lhs_ranges; + auto& lhs_negated_ranges = compares.lhs_negated_ranges; + auto& lhs_char_classes = compares.lhs_char_classes; + auto& lhs_negated_char_classes = compares.lhs_negated_char_classes; + auto& has_any_unicode_property = compares.has_any_unicode_property; + auto& lhs_unicode_general_categories = compares.lhs_unicode_general_categories; + auto& lhs_unicode_properties = compares.lhs_unicode_properties; + auto& lhs_unicode_scripts = compares.lhs_unicode_scripts; + auto& lhs_unicode_script_extensions = compares.lhs_unicode_script_extensions; + auto& lhs_negated_unicode_general_categories = compares.lhs_negated_unicode_general_categories; + auto& lhs_negated_unicode_properties = compares.lhs_negated_unicode_properties; + auto& lhs_negated_unicode_scripts = compares.lhs_negated_unicode_scripts; + auto& lhs_negated_unicode_script_extensions = compares.lhs_negated_unicode_script_extensions; auto any_unicode_property_matches = [&](u32 code_point) { if (any_of(lhs_negated_unicode_general_categories, [code_point](auto category) { return Unicode::code_point_has_general_category(code_point, category); })) @@ -214,98 +409,8 @@ static bool has_overlap(Vector const& lhs, Vector(pair.value)); - else - lhs_negated_char_classes.set(static_cast(pair.value)); - break; - case CharacterCompareType::CharRange: { - auto range = CharRange(pair.value); - if (!current_lhs_inversion_state()) - lhs_ranges.insert(range.from, range.to); - else - lhs_negated_ranges.insert(range.from, range.to); - break; - } - case CharacterCompareType::LookupTable: - // We've transformed this into a series of ranges in flat_compares(), so bail out if we see it. - return true; - case CharacterCompareType::Reference: - // We've handled this before coming here. - break; - case CharacterCompareType::Property: - has_any_unicode_property = true; - if (!current_lhs_inversion_state()) - lhs_unicode_properties.set(static_cast(pair.value)); - else - lhs_negated_unicode_properties.set(static_cast(pair.value)); - break; - case CharacterCompareType::GeneralCategory: - has_any_unicode_property = true; - if (!current_lhs_inversion_state()) - lhs_unicode_general_categories.set(static_cast(pair.value)); - else - lhs_negated_unicode_general_categories.set(static_cast(pair.value)); - break; - case CharacterCompareType::Script: - has_any_unicode_property = true; - if (!current_lhs_inversion_state()) - lhs_unicode_scripts.set(static_cast(pair.value)); - else - lhs_negated_unicode_scripts.set(static_cast(pair.value)); - break; - case CharacterCompareType::ScriptExtension: - has_any_unicode_property = true; - if (!current_lhs_inversion_state()) - lhs_unicode_script_extensions.set(static_cast(pair.value)); - else - lhs_negated_unicode_script_extensions.set(static_cast(pair.value)); - break; - case CharacterCompareType::Or: - case CharacterCompareType::EndAndOr: - // These are the default behaviour for [...], so we don't need to do anything (unless we add support for 'And' below). - break; - case CharacterCompareType::And: - // FIXME: These are too difficult to handle, so bail out. - return true; - case CharacterCompareType::Undefined: - case CharacterCompareType::RangeExpressionDummy: - // These do not occur in valid bytecode. - VERIFY_NOT_REACHED(); - } - } + if (!interpret_compares(lhs, compares)) + return true; // We can't interpret this, so we can't optimize it. if constexpr (REGEX_DEBUG) { dbgln("lhs ranges:"); diff --git a/Libraries/LibRegex/RegexParser.h b/Libraries/LibRegex/RegexParser.h index ae9b97875bb..5e5ae28bcb7 100644 --- a/Libraries/LibRegex/RegexParser.h +++ b/Libraries/LibRegex/RegexParser.h @@ -64,6 +64,8 @@ public: struct { Optional pure_substring_search; + // If populated, the pattern only accepts strings that start with a character in these ranges. + Vector starting_ranges; bool only_start_of_line = false; } optimization_data {}; };