mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-08-17 15:59:43 +00:00
LibRegex: Account for uppercase characters in insensitive patterns
This commit is contained in:
parent
31e8189f9f
commit
5b45223d5f
Notes:
github-actions[bot]
2025-07-12 09:27:33 +00:00
Author: https://github.com/alimpfard
Commit: 5b45223d5f
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5403
Reviewed-by: https://github.com/gmta ✅
6 changed files with 73 additions and 31 deletions
|
@ -6,6 +6,7 @@
|
||||||
|
|
||||||
#include "RegexByteCode.h"
|
#include "RegexByteCode.h"
|
||||||
#include "RegexDebug.h"
|
#include "RegexDebug.h"
|
||||||
|
|
||||||
#include <AK/BinarySearch.h>
|
#include <AK/BinarySearch.h>
|
||||||
#include <AK/CharacterTypes.h>
|
#include <AK/CharacterTypes.h>
|
||||||
#include <AK/StringBuilder.h>
|
#include <AK/StringBuilder.h>
|
||||||
|
@ -539,25 +540,24 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
|
||||||
if (input.view.length() <= state.string_position)
|
if (input.view.length() <= state.string_position)
|
||||||
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
return ExecutionResult::Failed_ExecuteLowPrioForks;
|
||||||
|
|
||||||
auto count = m_bytecode->at(offset++);
|
auto count_sensitive = m_bytecode->at(offset++);
|
||||||
auto range_data = m_bytecode->flat_data().slice(offset, count);
|
auto count_insensitive = m_bytecode->at(offset++);
|
||||||
offset += count;
|
auto sensitive_range_data = m_bytecode->flat_data().slice(offset, count_sensitive);
|
||||||
|
offset += count_sensitive;
|
||||||
|
auto insensitive_range_data = m_bytecode->flat_data().slice(offset, count_insensitive);
|
||||||
|
offset += count_insensitive;
|
||||||
|
|
||||||
|
bool const insensitive = input.regex_options & AllFlags::Insensitive;
|
||||||
|
|
||||||
auto ch = input.view[state.string_position_in_code_units];
|
auto ch = input.view[state.string_position_in_code_units];
|
||||||
|
if (insensitive)
|
||||||
|
ch = to_ascii_lowercase(ch);
|
||||||
|
|
||||||
auto const* matching_range = binary_search(range_data, ch, nullptr, [insensitive = input.regex_options & AllFlags::Insensitive](auto needle, CharRange range) {
|
auto const ranges = insensitive && !insensitive_range_data.is_empty() ? insensitive_range_data : sensitive_range_data;
|
||||||
auto upper_case_needle = needle;
|
auto const* matching_range = binary_search(ranges, ch, nullptr, [](auto needle, CharRange range) {
|
||||||
auto lower_case_needle = needle;
|
if (needle >= range.from && needle <= range.to)
|
||||||
if (insensitive) {
|
|
||||||
upper_case_needle = to_ascii_uppercase(needle);
|
|
||||||
lower_case_needle = to_ascii_lowercase(needle);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (lower_case_needle >= range.from && lower_case_needle <= range.to)
|
|
||||||
return 0;
|
return 0;
|
||||||
if (upper_case_needle >= range.from && upper_case_needle <= range.to)
|
if (needle > range.to)
|
||||||
return 0;
|
|
||||||
if (lower_case_needle > range.to || upper_case_needle > range.to)
|
|
||||||
return 1;
|
return 1;
|
||||||
return -1;
|
return -1;
|
||||||
});
|
});
|
||||||
|
@ -934,9 +934,11 @@ Vector<CompareTypeAndValuePair> OpCode_Compare::flat_compares() const
|
||||||
auto value = m_bytecode->at(offset++);
|
auto value = m_bytecode->at(offset++);
|
||||||
result.append({ compare_type, value });
|
result.append({ compare_type, value });
|
||||||
} else if (compare_type == CharacterCompareType::LookupTable) {
|
} else if (compare_type == CharacterCompareType::LookupTable) {
|
||||||
auto count = m_bytecode->at(offset++);
|
auto count_sensitive = m_bytecode->at(offset++);
|
||||||
for (size_t i = 0; i < count; ++i)
|
auto count_insensitive = m_bytecode->at(offset++);
|
||||||
|
for (size_t i = 0; i < count_sensitive; ++i)
|
||||||
result.append({ CharacterCompareType::CharRange, m_bytecode->at(offset++) });
|
result.append({ CharacterCompareType::CharRange, m_bytecode->at(offset++) });
|
||||||
|
offset += count_insensitive; // Skip insensitive ranges
|
||||||
} else if (compare_type == CharacterCompareType::GeneralCategory
|
} else if (compare_type == CharacterCompareType::GeneralCategory
|
||||||
|| compare_type == CharacterCompareType::Property
|
|| compare_type == CharacterCompareType::Property
|
||||||
|| compare_type == CharacterCompareType::Script
|
|| compare_type == CharacterCompareType::Script
|
||||||
|
@ -1027,11 +1029,21 @@ Vector<ByteString> OpCode_Compare::variable_arguments_to_byte_string(Optional<Ma
|
||||||
" compare against: '{}'",
|
" compare against: '{}'",
|
||||||
input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_byte_string()));
|
input.value().view.substring_view(string_start_offset, state().string_position > view.length() ? 0 : 1).to_byte_string()));
|
||||||
} else if (compare_type == CharacterCompareType::LookupTable) {
|
} else if (compare_type == CharacterCompareType::LookupTable) {
|
||||||
auto count = m_bytecode->at(offset++);
|
auto count_sensitive = m_bytecode->at(offset++);
|
||||||
for (size_t j = 0; j < count; ++j) {
|
auto count_insensitive = m_bytecode->at(offset++);
|
||||||
|
for (size_t j = 0; j < count_sensitive; ++j) {
|
||||||
auto range = (CharRange)m_bytecode->at(offset++);
|
auto range = (CharRange)m_bytecode->at(offset++);
|
||||||
result.append(ByteString::formatted(" {:x}-{:x}", range.from, range.to));
|
result.append(ByteString::formatted(" {:x}-{:x}", range.from, range.to));
|
||||||
}
|
}
|
||||||
|
if (count_insensitive > 0) {
|
||||||
|
result.append(" [insensitive ranges:");
|
||||||
|
for (size_t j = 0; j < count_insensitive; ++j) {
|
||||||
|
auto range = (CharRange)m_bytecode->at(offset++);
|
||||||
|
result.append(ByteString::formatted(" {:x}-{:x}", range.from, range.to));
|
||||||
|
}
|
||||||
|
result.append(" ]");
|
||||||
|
}
|
||||||
|
|
||||||
if (!view.is_null() && view.length() > state().string_position)
|
if (!view.is_null() && view.length() > state().string_position)
|
||||||
result.empend(ByteString::formatted(
|
result.empend(ByteString::formatted(
|
||||||
" compare against: '{}'",
|
" compare against: '{}'",
|
||||||
|
|
|
@ -117,8 +117,8 @@ enum class BoundaryCheckType : ByteCodeValueType {
|
||||||
};
|
};
|
||||||
|
|
||||||
struct CharRange {
|
struct CharRange {
|
||||||
u32 const from;
|
u32 from;
|
||||||
u32 const to;
|
u32 to;
|
||||||
|
|
||||||
CharRange(u64 value)
|
CharRange(u64 value)
|
||||||
: from(value >> 32)
|
: from(value >> 32)
|
||||||
|
@ -213,6 +213,12 @@ public:
|
||||||
Base::extend(other);
|
Base::extend(other);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// extend() variant constrained (via SameAs) to accept exactly a Vector<ByteCodeValueType>.
// The vector is taken by value and handed, moved, to Base::append().
// NOTE(review): Base is declared outside this view; how it stores the appended vector is not visible here - confirm.
template<SameAs<Vector<ByteCodeValueType>> T>
|
||||||
|
void extend(T other)
|
||||||
|
{
|
||||||
|
Base::append(move(other));
|
||||||
|
}
|
||||||
|
|
||||||
template<typename... Args>
|
template<typename... Args>
|
||||||
void empend(Args&&... args)
|
void empend(Args&&... args)
|
||||||
{
|
{
|
||||||
|
|
|
@ -288,8 +288,14 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
|
||||||
if (match_length_minimum && match_length_minimum > view_length - view_index)
|
if (match_length_minimum && match_length_minimum > view_length - view_index)
|
||||||
break;
|
break;
|
||||||
|
|
||||||
|
auto const insensitive = input.regex_options.has_flag_set(AllFlags::Insensitive);
|
||||||
if (auto& starting_ranges = m_pattern->parser_result.optimization_data.starting_ranges; !starting_ranges.is_empty()) {
|
if (auto& starting_ranges = m_pattern->parser_result.optimization_data.starting_ranges; !starting_ranges.is_empty()) {
|
||||||
if (!binary_search(starting_ranges, input.view.code_unit_at(view_index), nullptr, compare_range))
|
auto ranges = insensitive ? m_pattern->parser_result.optimization_data.starting_ranges_insensitive.span() : starting_ranges.span();
|
||||||
|
auto ch = input.view.code_unit_at(view_index);
|
||||||
|
if (insensitive)
|
||||||
|
ch = to_ascii_lowercase(ch);
|
||||||
|
|
||||||
|
if (!binary_search(ranges, ch, nullptr, compare_range))
|
||||||
goto done_matching;
|
goto done_matching;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -223,8 +223,11 @@ void Regex<Parser>::fill_optimization_data(BasicBlockList const& blocks)
|
||||||
if (!compares.char_classes.is_empty() || !compares.negated_char_classes.is_empty() || !compares.negated_ranges.is_empty())
|
if (!compares.char_classes.is_empty() || !compares.negated_char_classes.is_empty() || !compares.negated_ranges.is_empty())
|
||||||
return;
|
return;
|
||||||
|
|
||||||
for (auto it = compares.ranges.begin(); it != compares.ranges.end(); ++it)
|
for (auto it = compares.ranges.begin(); it != compares.ranges.end(); ++it) {
|
||||||
parser_result.optimization_data.starting_ranges.append({ it.key(), *it });
|
parser_result.optimization_data.starting_ranges.append({ it.key(), *it });
|
||||||
|
parser_result.optimization_data.starting_ranges_insensitive.append({ to_ascii_lowercase(it.key()), to_ascii_lowercase(*it) });
|
||||||
|
quick_sort(parser_result.optimization_data.starting_ranges_insensitive, [](CharRange a, CharRange b) { return a.from < b.from; });
|
||||||
|
}
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
case OpCodeId::CheckBegin:
|
case OpCodeId::CheckBegin:
|
||||||
|
@ -1863,10 +1866,13 @@ void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndVa
|
||||||
auto append_table = [&](auto& table) {
|
auto append_table = [&](auto& table) {
|
||||||
++argument_count;
|
++argument_count;
|
||||||
arguments.append(to_underlying(CharacterCompareType::LookupTable));
|
arguments.append(to_underlying(CharacterCompareType::LookupTable));
|
||||||
auto size_index = arguments.size();
|
auto sensitive_size_index = arguments.size();
|
||||||
|
auto insensitive_size_index = sensitive_size_index + 1;
|
||||||
arguments.append(0);
|
arguments.append(0);
|
||||||
|
arguments.append(0);
|
||||||
|
|
||||||
Optional<CharRange> active_range;
|
Optional<CharRange> active_range;
|
||||||
size_t range_count = 0;
|
Vector<ByteCodeValueType> range_data;
|
||||||
for (auto& range : table) {
|
for (auto& range : table) {
|
||||||
if (!active_range.has_value()) {
|
if (!active_range.has_value()) {
|
||||||
active_range = range;
|
active_range = range;
|
||||||
|
@ -1876,16 +1882,25 @@ void Optimizer::append_character_class(ByteCode& target, Vector<CompareTypeAndVa
|
||||||
if (range.from <= active_range->to + 1 && range.to + 1 >= active_range->from) {
|
if (range.from <= active_range->to + 1 && range.to + 1 >= active_range->from) {
|
||||||
active_range = CharRange { min(range.from, active_range->from), max(range.to, active_range->to) };
|
active_range = CharRange { min(range.from, active_range->from), max(range.to, active_range->to) };
|
||||||
} else {
|
} else {
|
||||||
++range_count;
|
range_data.append(active_range.release_value());
|
||||||
arguments.append(active_range.release_value());
|
|
||||||
active_range = range;
|
active_range = range;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (active_range.has_value()) {
|
if (active_range.has_value())
|
||||||
++range_count;
|
range_data.append(active_range.release_value());
|
||||||
arguments.append(active_range.release_value());
|
arguments.extend(range_data);
|
||||||
|
arguments[sensitive_size_index] = range_data.size();
|
||||||
|
|
||||||
|
if (!all_of(range_data, [](CharRange range) { return range.from == to_ascii_lowercase(range.from) && range.to == to_ascii_lowercase(range.to); })) {
|
||||||
|
Vector<ByteCodeValueType> insensitive_data;
|
||||||
|
insensitive_data.ensure_capacity(range_data.size());
|
||||||
|
for (CharRange range : range_data)
|
||||||
|
insensitive_data.append(CharRange { to_ascii_lowercase(range.from), to_ascii_lowercase(range.to) });
|
||||||
|
quick_sort(insensitive_data, [](CharRange a, CharRange b) { return a.from < b.from; });
|
||||||
|
|
||||||
|
arguments.extend(insensitive_data);
|
||||||
|
arguments[insensitive_size_index] = insensitive_data.size();
|
||||||
}
|
}
|
||||||
arguments[size_index] = range_count;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
auto contains_regular_table = !table.is_empty();
|
auto contains_regular_table = !table.is_empty();
|
||||||
|
|
|
@ -66,6 +66,7 @@ public:
|
||||||
Optional<ByteString> pure_substring_search;
|
Optional<ByteString> pure_substring_search;
|
||||||
// If populated, the pattern only accepts strings that start with a character in these ranges.
|
// If populated, the pattern only accepts strings that start with a character in these ranges.
|
||||||
Vector<CharRange> starting_ranges;
|
Vector<CharRange> starting_ranges;
|
||||||
|
Vector<CharRange> starting_ranges_insensitive;
|
||||||
bool only_start_of_line = false;
|
bool only_start_of_line = false;
|
||||||
} optimization_data {};
|
} optimization_data {};
|
||||||
};
|
};
|
||||||
|
|
|
@ -736,6 +736,8 @@ TEST_CASE(ECMA262_match)
|
||||||
{ "(?!(b))\\1"sv, "a"sv, false },
|
{ "(?!(b))\\1"sv, "a"sv, false },
|
||||||
// String table merge bug: inverse map should be merged regardless of available direct mappings.
|
// String table merge bug: inverse map should be merged regardless of available direct mappings.
|
||||||
{ "((?<x>a)|(?<x>b))"sv, "aa"sv, false },
|
{ "((?<x>a)|(?<x>b))"sv, "aa"sv, false },
|
||||||
|
// Insensitive charclasses should accept upper/lowercase in pattern (lookup table should still be ordered if insensitive lookup is used), ladybird#5399.
|
||||||
|
{ "[aBc]"sv, "b"sv, true, ECMAScriptFlags::Insensitive },
|
||||||
};
|
};
|
||||||
|
|
||||||
for (auto& test : tests) {
|
for (auto& test : tests) {
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue