From 5b45223d5fe12d0a845d980a12db8444d1316734 Mon Sep 17 00:00:00 2001 From: Ali Mohammad Pur Date: Fri, 11 Jul 2025 15:06:40 +0200 Subject: [PATCH] LibRegex: Account for uppercase characters in insensitive patterns --- Libraries/LibRegex/RegexByteCode.cpp | 50 +++++++++++++++++---------- Libraries/LibRegex/RegexByteCode.h | 10 ++++-- Libraries/LibRegex/RegexMatcher.cpp | 8 ++++- Libraries/LibRegex/RegexOptimizer.cpp | 33 +++++++++++++----- Libraries/LibRegex/RegexParser.h | 1 + Tests/LibRegex/TestRegex.cpp | 2 ++ 6 files changed, 73 insertions(+), 31 deletions(-) diff --git a/Libraries/LibRegex/RegexByteCode.cpp b/Libraries/LibRegex/RegexByteCode.cpp index dbf1114a430..a3e272cc940 100644 --- a/Libraries/LibRegex/RegexByteCode.cpp +++ b/Libraries/LibRegex/RegexByteCode.cpp @@ -6,6 +6,7 @@ #include "RegexByteCode.h" #include "RegexDebug.h" + #include #include #include @@ -539,25 +540,24 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M if (input.view.length() <= state.string_position) return ExecutionResult::Failed_ExecuteLowPrioForks; - auto count = m_bytecode->at(offset++); - auto range_data = m_bytecode->flat_data().slice(offset, count); - offset += count; + auto count_sensitive = m_bytecode->at(offset++); + auto count_insensitive = m_bytecode->at(offset++); + auto sensitive_range_data = m_bytecode->flat_data().slice(offset, count_sensitive); + offset += count_sensitive; + auto insensitive_range_data = m_bytecode->flat_data().slice(offset, count_insensitive); + offset += count_insensitive; + + bool const insensitive = input.regex_options & AllFlags::Insensitive; auto ch = input.view[state.string_position_in_code_units]; + if (insensitive) + ch = to_ascii_lowercase(ch); - auto const* matching_range = binary_search(range_data, ch, nullptr, [insensitive = input.regex_options & AllFlags::Insensitive](auto needle, CharRange range) { - auto upper_case_needle = needle; - auto lower_case_needle = needle; - if (insensitive) { - upper_case_needle = to_ascii_uppercase(needle); - lower_case_needle = to_ascii_lowercase(needle); - } - - if (lower_case_needle >= range.from && lower_case_needle <= range.to) + auto const ranges = insensitive && !insensitive_range_data.is_empty() ? insensitive_range_data : sensitive_range_data; + auto const* matching_range = binary_search(ranges, ch, nullptr, [](auto needle, CharRange range) { + if (needle >= range.from && needle <= range.to) return 0; - if (upper_case_needle >= range.from && upper_case_needle <= range.to) - return 0; - if (lower_case_needle > range.to || upper_case_needle > range.to) + if (needle > range.to) return 1; return -1; }); @@ -934,9 +934,11 @@ Vector OpCode_Compare::flat_compares() const auto value = m_bytecode->at(offset++); result.append({ compare_type, value }); } else if (compare_type == CharacterCompareType::LookupTable) { - auto count = m_bytecode->at(offset++); - for (size_t i = 0; i < count; ++i) + auto count_sensitive = m_bytecode->at(offset++); + auto count_insensitive = m_bytecode->at(offset++); + for (size_t i = 0; i < count_sensitive; ++i) result.append({ CharacterCompareType::CharRange, m_bytecode->at(offset++) }); + offset += count_insensitive; // Skip insensitive ranges } else if (compare_type == CharacterCompareType::GeneralCategory || compare_type == CharacterCompareType::Property || compare_type == CharacterCompareType::Script @@ -1027,11 +1029,21 @@ Vector OpCode_Compare::variable_arguments_to_byte_string(Optional view.length() ? 0 : 1).to_byte_string())); } else if (compare_type == CharacterCompareType::LookupTable) { - auto count = m_bytecode->at(offset++); - for (size_t j = 0; j < count; ++j) { + auto count_sensitive = m_bytecode->at(offset++); + auto count_insensitive = m_bytecode->at(offset++); + for (size_t j = 0; j < count_sensitive; ++j) { auto range = (CharRange)m_bytecode->at(offset++); result.append(ByteString::formatted(" {:x}-{:x}", range.from, range.to)); } + if (count_insensitive > 0) { + result.append(" [insensitive ranges:"); + for (size_t j = 0; j < count_insensitive; ++j) { + auto range = (CharRange)m_bytecode->at(offset++); + result.append(ByteString::formatted(" {:x}-{:x}", range.from, range.to)); + } + result.append(" ]"); + } + if (!view.is_null() && view.length() > state().string_position) result.empend(ByteString::formatted( " compare against: '{}'", diff --git a/Libraries/LibRegex/RegexByteCode.h b/Libraries/LibRegex/RegexByteCode.h index 75d54293bd7..c18aa21b321 100644 --- a/Libraries/LibRegex/RegexByteCode.h +++ b/Libraries/LibRegex/RegexByteCode.h @@ -117,8 +117,8 @@ enum class BoundaryCheckType : ByteCodeValueType { }; struct CharRange { - u32 const from; - u32 const to; + u32 from; + u32 to; CharRange(u64 value) : from(value >> 32) @@ -213,6 +213,12 @@ public: Base::extend(other); } + template> T> + void extend(T other) + { + Base::append(move(other)); + } + template void empend(Args&&... args) { diff --git a/Libraries/LibRegex/RegexMatcher.cpp b/Libraries/LibRegex/RegexMatcher.cpp index 972dda82247..fc5fc858df1 100644 --- a/Libraries/LibRegex/RegexMatcher.cpp +++ b/Libraries/LibRegex/RegexMatcher.cpp @@ -288,8 +288,14 @@ RegexResult Matcher::match(Vector const& views, Optiona if (match_length_minimum && match_length_minimum > view_length - view_index) break; + auto const insensitive = input.regex_options.has_flag_set(AllFlags::Insensitive); if (auto& starting_ranges = m_pattern->parser_result.optimization_data.starting_ranges; !starting_ranges.is_empty()) { - if (!binary_search(starting_ranges, input.view.code_unit_at(view_index), nullptr, compare_range)) + auto ranges = insensitive ? m_pattern->parser_result.optimization_data.starting_ranges_insensitive.span() : starting_ranges.span(); + auto ch = input.view.code_unit_at(view_index); + if (insensitive) + ch = to_ascii_lowercase(ch); + + if (!binary_search(ranges, ch, nullptr, compare_range)) goto done_matching; } diff --git a/Libraries/LibRegex/RegexOptimizer.cpp b/Libraries/LibRegex/RegexOptimizer.cpp index e00d6688f58..fdb95e73bd2 100644 --- a/Libraries/LibRegex/RegexOptimizer.cpp +++ b/Libraries/LibRegex/RegexOptimizer.cpp @@ -223,8 +223,11 @@ void Regex::fill_optimization_data(BasicBlockList const& blocks) if (!compares.char_classes.is_empty() || !compares.negated_char_classes.is_empty() || !compares.negated_ranges.is_empty()) return; - for (auto it = compares.ranges.begin(); it != compares.ranges.end(); ++it) + for (auto it = compares.ranges.begin(); it != compares.ranges.end(); ++it) { parser_result.optimization_data.starting_ranges.append({ it.key(), *it }); + parser_result.optimization_data.starting_ranges_insensitive.append({ to_ascii_lowercase(it.key()), to_ascii_lowercase(*it) }); + quick_sort(parser_result.optimization_data.starting_ranges_insensitive, [](CharRange a, CharRange b) { return a.from < b.from; }); + } return; } case OpCodeId::CheckBegin: @@ -1863,10 +1866,13 @@ void Optimizer::append_character_class(ByteCode& target, Vector active_range; - size_t range_count = 0; + Vector range_data; for (auto& range : table) { if (!active_range.has_value()) { active_range = range; @@ -1876,16 +1882,25 @@ void Optimizer::append_character_class(ByteCode& target, Vectorto + 1 && range.to + 1 >= active_range->from) { active_range = CharRange { min(range.from, active_range->from), max(range.to, active_range->to) }; } else { - ++range_count; - arguments.append(active_range.release_value()); + range_data.append(active_range.release_value()); active_range = range; } } - if (active_range.has_value()) { - ++range_count; - arguments.append(active_range.release_value()); + if (active_range.has_value()) + range_data.append(active_range.release_value()); + arguments.extend(range_data); + arguments[sensitive_size_index] = range_data.size(); + + if (!all_of(range_data, [](CharRange range) { return range.from == to_ascii_lowercase(range.from) && range.to == to_ascii_lowercase(range.to); })) { + Vector insensitive_data; + insensitive_data.ensure_capacity(range_data.size()); + for (CharRange range : range_data) + insensitive_data.append(CharRange { to_ascii_lowercase(range.from), to_ascii_lowercase(range.to) }); + quick_sort(insensitive_data, [](CharRange a, CharRange b) { return a.from < b.from; }); + + arguments.extend(insensitive_data); + arguments[insensitive_size_index] = insensitive_data.size(); } - arguments[size_index] = range_count; }; auto contains_regular_table = !table.is_empty(); diff --git a/Libraries/LibRegex/RegexParser.h b/Libraries/LibRegex/RegexParser.h index c13f9e84c8a..f9d80081a39 100644 --- a/Libraries/LibRegex/RegexParser.h +++ b/Libraries/LibRegex/RegexParser.h @@ -66,6 +66,7 @@ public: Optional pure_substring_search; // If populated, the pattern only accepts strings that start with a character in these ranges. Vector starting_ranges; + Vector starting_ranges_insensitive; bool only_start_of_line = false; } optimization_data {}; }; diff --git a/Tests/LibRegex/TestRegex.cpp b/Tests/LibRegex/TestRegex.cpp index 762aa1f9511..8bb355106ab 100644 --- a/Tests/LibRegex/TestRegex.cpp +++ b/Tests/LibRegex/TestRegex.cpp @@ -736,6 +736,8 @@ TEST_CASE(ECMA262_match) { "(?!(b))\\1"sv, "a"sv, false }, // String table merge bug: inverse map should be merged regardless of available direct mappings. { "((?a)|(?b))"sv, "aa"sv, false }, + // Insensitive charclasses should accept upper/lowercase in pattern (lookup table should still be ordered if insensitive lookup is used), ladybird#5399. + { "[aBc]"sv, "b"sv, true, ECMAScriptFlags::Insensitive }, }; for (auto& test : tests) {