LibRegex: Flatten capture group list in MatchState

This makes copying the capture group COWVector significantly cheaper,
as we no longer have to run any constructors for it - just memcpy.
This commit is contained in:
Ali Mohammad Pur 2025-04-15 15:31:08 +02:00
parent 141f6cb392
commit 0f675d5992
14 changed files with 98 additions and 87 deletions

View file

@ -127,6 +127,13 @@ public:
return m_detail->m_members[index];
}
Span<T const> span() const { return m_detail->m_members; }
// Returns a writable view over the stored members.
// copy() runs before the span is taken; NOTE(review): presumably it gives this
// instance its own storage so that writes through the span do not reach other
// holders of the same data - confirm against copy()'s definition.
Span<T> mutable_span()
{
copy();
return m_detail->m_members;
}
size_t capacity() const
{
return m_detail->m_members.capacity();

View file

@ -35,7 +35,6 @@ public:
static constexpr regex::RegexOptions<ECMAScriptFlags> default_flags {
(regex::ECMAScriptFlags)regex::AllFlags::SingleMatch
| (regex::ECMAScriptFlags)regex::AllFlags::Global
| (regex::ECMAScriptFlags)regex::AllFlags::SkipTrimEmptyMatches
| regex::ECMAScriptFlags::BrowserExtended
};

View file

@ -294,7 +294,7 @@ static ThrowCompletionOr<Value> regexp_builtin_exec(VM& vm, RegExpObject& regexp
// 33. For each integer i such that i ≥ 1 and i ≤ n, in ascending order, do
for (size_t i = 1; i <= result.n_capture_groups; ++i) {
// a. Let captureI be ith element of r's captures List.
auto& capture = result.capture_group_matches[0][i];
auto& capture = result.capture_group_matches[0][i - 1];
Value captured_value;

View file

@ -341,40 +341,29 @@ ALWAYS_INLINE ExecutionResult OpCode_CheckEnd::execute(MatchInput const& input,
ALWAYS_INLINE ExecutionResult OpCode_ClearCaptureGroup::execute(MatchInput const& input, MatchState& state) const
{
if (input.match_index < state.capture_group_matches.size()) {
auto& group = state.capture_group_matches.mutable_at(input.match_index);
auto group_id = id();
if (group_id >= group.size())
group.resize(group_id + 1);
group[group_id].reset();
if (input.match_index < state.capture_group_matches_size()) {
auto group = state.mutable_capture_group_matches(input.match_index);
group[id() - 1].reset();
}
return ExecutionResult::Continue;
}
ALWAYS_INLINE ExecutionResult OpCode_SaveLeftCaptureGroup::execute(MatchInput const& input, MatchState& state) const
{
if (input.match_index >= state.capture_group_matches.size()) {
state.capture_group_matches.ensure_capacity(input.match_index);
auto capacity = state.capture_group_matches.capacity();
for (size_t i = state.capture_group_matches.size(); i <= capacity; ++i)
state.capture_group_matches.empend();
if (input.match_index >= state.capture_group_matches_size()) {
state.flat_capture_group_matches.ensure_capacity((input.match_index + 1) * state.capture_group_count);
for (size_t i = state.capture_group_matches_size(); i <= input.match_index; ++i)
for (size_t j = 0; j < state.capture_group_count; ++j)
state.flat_capture_group_matches.append({});
}
if (id() >= state.capture_group_matches.at(input.match_index).size()) {
state.capture_group_matches.mutable_at(input.match_index).ensure_capacity(id());
auto capacity = state.capture_group_matches.at(input.match_index).capacity();
for (size_t i = state.capture_group_matches.at(input.match_index).size(); i <= capacity; ++i)
state.capture_group_matches.mutable_at(input.match_index).empend();
}
state.capture_group_matches.mutable_at(input.match_index).at(id()).left_column = state.string_position;
state.mutable_capture_group_matches(input.match_index).at(id() - 1).left_column = state.string_position;
return ExecutionResult::Continue;
}
ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(MatchInput const& input, MatchState& state) const
{
auto& match = state.capture_group_matches.mutable_at(input.match_index).at(id());
auto& match = state.capture_group_matches(input.match_index).at(id() - 1);
auto start_position = match.left_column;
if (state.string_position < start_position) {
dbgln("Right capture group {} is before left capture group {}!", state.string_position, start_position);
@ -388,14 +377,14 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(MatchInput c
VERIFY(start_position + length <= input.view.length());
match = { input.view.substring_view(start_position, length), input.line, start_position, input.global_offset + start_position };
state.mutable_capture_group_matches(input.match_index).at(id() - 1) = { input.view.substring_view(start_position, length), input.line, start_position, input.global_offset + start_position };
return ExecutionResult::Continue;
}
ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(MatchInput const& input, MatchState& state) const
{
auto& match = state.capture_group_matches.mutable_at(input.match_index).at(id());
auto& match = state.capture_group_matches(input.match_index).at(id() - 1);
auto start_position = match.left_column;
if (state.string_position < start_position)
return ExecutionResult::Failed_ExecuteLowPrioForks;
@ -409,7 +398,7 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(MatchIn
auto view = input.view.substring_view(start_position, length);
match = { view, name_string_table_index(), input.line, start_position, input.global_offset + start_position };
state.mutable_capture_group_matches(input.match_index).at(id() - 1) = { view, name_string_table_index(), input.line, start_position, input.global_offset + start_position };
return ExecutionResult::Continue;
}
@ -584,11 +573,11 @@ ALWAYS_INLINE ExecutionResult OpCode_Compare::execute(MatchInput const& input, M
break;
}
case CharacterCompareType::Reference: {
auto reference_number = (size_t)m_bytecode->at(offset++);
if (input.match_index >= state.capture_group_matches.size())
auto reference_number = ((size_t)m_bytecode->at(offset++)) - 1;
if (input.match_index >= state.capture_group_matches_size())
return ExecutionResult::Failed_ExecuteLowPrioForks;
auto& groups = state.capture_group_matches.at(input.match_index);
auto groups = state.capture_group_matches(input.match_index);
if (groups.size() <= reference_number)
return ExecutionResult::Failed_ExecuteLowPrioForks;
@ -988,8 +977,8 @@ Vector<ByteString> OpCode_Compare::variable_arguments_to_byte_string(Optional<Ma
auto ref = m_bytecode->at(offset++);
result.empend(ByteString::formatted(" number={}", ref));
if (input.has_value()) {
if (state().capture_group_matches.size() > input->match_index) {
auto& match = state().capture_group_matches[input->match_index];
if (state().capture_group_matches_size() > input->match_index) {
auto match = state().capture_group_matches(input->match_index);
if (match.size() > ref) {
auto& group = match[ref];
result.empend(ByteString::formatted(" left={}", group.left_column));
@ -999,7 +988,7 @@ Vector<ByteString> OpCode_Compare::variable_arguments_to_byte_string(Optional<Ma
result.empend(ByteString::formatted(" (invalid ref, max={})", match.size() - 1));
}
} else {
result.empend(ByteString::formatted(" (invalid index {}, max={})", input->match_index, state().capture_group_matches.size() - 1));
result.empend(ByteString::formatted(" (invalid index {}, max={})", input->match_index, state().capture_group_matches_size() - 1));
}
}
} else if (compare_type == CharacterCompareType::String) {

View file

@ -39,7 +39,7 @@ public:
void print_bytecode(ByteCode const& bytecode) const
{
MatchState state;
auto state = MatchState::only_for_enumeration();
for (;;) {
auto& opcode = bytecode.get_opcode(state);
print_opcode("PrintBytecode", opcode, state);

View file

@ -43,12 +43,11 @@ enum __RegexAllFlags {
__Regex_SingleLine = __Regex_Global << 10, // Dot matches newline characters
__Regex_Sticky = __Regex_Global << 11, // Force the pattern to only match consecutive matches from where the previous match ended.
__Regex_Multiline = __Regex_Global << 12, // Handle newline characters. Match each line, one by one.
__Regex_SkipTrimEmptyMatches = __Regex_Global << 13, // Do not remove empty capture group results.
__Regex_SingleMatch = __Regex_Global << 14, // Stop after acquiring a single match.
__Regex_UnicodeSets = __Regex_Global << 15, // ECMA262 Parser specific: Allow set operations in char classes.
__Regex_Internal_Stateful = __Regex_Global << 16, // Internal flag; enables stateful matches.
__Regex_Internal_BrowserExtended = __Regex_Global << 17, // Internal flag; enable browser-specific ECMA262 extensions.
__Regex_Internal_ConsiderNewline = __Regex_Global << 18, // Internal flag; allow matchers to consider newlines as line separators.
__Regex_Internal_ECMA262DotSemantics = __Regex_Global << 19, // Internal flag; use ECMA262 semantics for dot ('.') - disallow CR/LF/LS/PS instead of just CR.
__Regex_SingleMatch = __Regex_Global << 13, // Stop after acquiring a single match.
__Regex_UnicodeSets = __Regex_Global << 14, // ECMA262 Parser specific: Allow set operations in char classes.
__Regex_Internal_Stateful = __Regex_Global << 15, // Internal flag; enables stateful matches.
__Regex_Internal_BrowserExtended = __Regex_Global << 16, // Internal flag; enable browser-specific ECMA262 extensions.
__Regex_Internal_ConsiderNewline = __Regex_Global << 17, // Internal flag; allow matchers to consider newlines as line separators.
__Regex_Internal_ECMA262DotSemantics = __Regex_Global << 18, // Internal flag; use ECMA262 semantics for dot ('.') - disallow CR/LF/LS/PS instead of just CR.
__Regex_Last = __Regex_Internal_ECMA262DotSemantics,
};

View file

@ -369,6 +369,7 @@ struct MatchInput {
};
struct MatchState {
size_t capture_group_count;
size_t string_position_before_match { 0 };
size_t string_position { 0 };
size_t string_position_in_code_units { 0 };
@ -377,10 +378,38 @@ struct MatchState {
size_t forks_since_last_save { 0 };
Optional<size_t> initiating_fork;
COWVector<Match> matches;
COWVector<Vector<Match>> capture_group_matches;
COWVector<Match> flat_capture_group_matches; // Vector<Vector<Match>> indexed by match index, then by capture group id; flattened for performance
COWVector<u64> repetition_marks;
Vector<u64, 64> checkpoints;
// Creates a state for a pattern with `capture_group_count` capture groups.
// The count is the stride used when slicing flat_capture_group_matches into
// per-match groups.
explicit MatchState(size_t capture_group_count)
: capture_group_count(capture_group_count)
{
}
// Copying and moving use the compiler-generated member-wise implementations.
MatchState(MatchState const&) = default;
MatchState(MatchState&&) = default;
MatchState& operator=(MatchState const&) = default;
MatchState& operator=(MatchState&&) = default;
// Builds a state whose capture group count is zero.
// NOTE(review): judging by the name, this is meant for callers that only step
// through bytecode and never record captures - confirm at the call sites.
static MatchState only_for_enumeration() { return MatchState { 0 }; }
// Returns how many per-match groups are currently stored in
// flat_capture_group_matches, i.e. the flat element count divided by the
// per-match stride (capture_group_count).
size_t capture_group_matches_size() const
{
    // A zero stride (e.g. a state made by only_for_enumeration(), or a pattern
    // without capture groups) stores no groups at all; report 0 instead of
    // dividing by zero, which is undefined behaviour.
    if (capture_group_count == 0)
        return 0;
    return flat_capture_group_matches.size() / capture_group_count;
}
// Read-only view of the capture group slots belonging to match `match_index`.
// The slots of one match are stored contiguously, capture_group_count per match.
Span<Match const> capture_group_matches(size_t match_index) const
{
    auto const first_slot = match_index * capture_group_count;
    auto const all_slots = flat_capture_group_matches.span();
    return all_slots.slice(first_slot, capture_group_count);
}
// Writable view of the capture group slots belonging to match `match_index`.
// The view is taken from flat_capture_group_matches.mutable_span() and then
// narrowed to the capture_group_count slots of the requested match.
Span<Match> mutable_capture_group_matches(size_t match_index)
{
    auto all_slots = flat_capture_group_matches.mutable_span();
    auto const first_slot = match_index * capture_group_count;
    return all_slots.slice(first_slot, capture_group_count);
}
// For size_t in {0..100}, ips in {0..500} and repetitions in {0..30}, there are zero collisions.
// For the full range, zero collisions were found in 8 million random samples.
u64 u64_hash() const

View file

@ -164,7 +164,7 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
size_t match_count { 0 };
MatchInput input;
MatchState state;
MatchState state { m_pattern->parser_result.capture_groups_count };
size_t operations = 0;
input.regex_options = m_regex_options | regex_options.value_or({}).value();
@ -189,20 +189,6 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
}
}
if (c_match_preallocation_count) {
state.matches.ensure_capacity(c_match_preallocation_count);
state.capture_group_matches.ensure_capacity(c_match_preallocation_count);
auto& capture_groups_count = m_pattern->parser_result.capture_groups_count;
for (size_t j = 0; j < c_match_preallocation_count; ++j) {
state.matches.empend();
state.capture_group_matches.empend();
state.capture_group_matches.mutable_at(j).ensure_capacity(capture_groups_count);
for (size_t k = 0; k < capture_groups_count; ++k)
state.capture_group_matches.mutable_at(j).unchecked_append({});
}
}
auto append_match = [](auto& input, auto& state, auto& start_position) {
if (state.matches.size() == input.match_index)
state.matches.empend();
@ -343,29 +329,34 @@ RegexResult Matcher<Parser>::match(Vector<RegexStringView> const& views, Optiona
break;
}
auto flat_capture_group_matches = move(state.flat_capture_group_matches).release();
if (flat_capture_group_matches.size() < state.capture_group_count * match_count) {
flat_capture_group_matches.ensure_capacity(match_count * state.capture_group_count);
for (size_t i = flat_capture_group_matches.size(); i < match_count * state.capture_group_count; ++i)
flat_capture_group_matches.empend();
}
Vector<Span<Match>> capture_group_matches;
for (size_t i = 0; i < match_count; ++i) {
auto span = flat_capture_group_matches.span().slice(state.capture_group_count * i, state.capture_group_count);
capture_group_matches.append(span);
}
RegexResult result {
match_count != 0,
match_count,
move(state.matches).release(),
move(state.capture_group_matches).release(),
move(flat_capture_group_matches),
move(capture_group_matches),
operations,
m_pattern->parser_result.capture_groups_count,
m_pattern->parser_result.named_capture_groups_count,
};
if (match_count) {
// Make sure there are as many capture matches as there are actual matches.
if (result.capture_group_matches.size() < match_count)
result.capture_group_matches.resize(match_count);
for (auto& matches : result.capture_group_matches)
matches.resize(m_pattern->parser_result.capture_groups_count + 1);
if (!input.regex_options.has_flag_set(AllFlags::SkipTrimEmptyMatches)) {
for (auto& matches : result.capture_group_matches)
matches.remove_all_matching([](auto& match) { return match.view.is_null(); });
}
} else {
if (match_count > 0)
VERIFY(result.capture_group_matches.size() >= match_count);
else
result.capture_group_matches.clear_with_capacity();
}
return result;
}

View file

@ -31,13 +31,13 @@ struct Block {
}
static constexpr size_t const c_max_recursion = 5000;
static constexpr size_t const c_match_preallocation_count = 0;
struct RegexResult final {
bool success { false };
size_t count { 0 };
Vector<Match> matches;
Vector<Vector<Match>> capture_group_matches;
Vector<Match> flat_capture_group_matches;
Vector<Span<Match>> capture_group_matches;
size_t n_operations { 0 };
size_t n_capture_groups { 0 };
size_t n_named_capture_groups { 0 };

View file

@ -37,7 +37,7 @@ void Regex<Parser>::run_optimization_passes()
attempt_rewrite_loops_as_atomic_groups(blocks);
// FIXME: "There are a few more conditions this can be true in (e.g. within an arbitrarily nested capture group)"
MatchState state;
auto state = MatchState::only_for_enumeration();
auto& opcode = parser_result.bytecode.get_opcode(state);
if (opcode.opcode_id() == OpCodeId::CheckBegin)
parser_result.optimization_data.only_start_of_line = true;
@ -53,7 +53,7 @@ typename Regex<Parser>::BasicBlockList Regex<Parser>::split_basic_blocks(ByteCod
auto bytecode_size = bytecode.size();
MatchState state;
auto state = MatchState::only_for_enumeration();
state.instruction_position = 0;
auto check_jump = [&]<typename T>(OpCode const& opcode) {
auto& op = static_cast<T const&>(opcode);
@ -512,7 +512,7 @@ enum class AtomicRewritePreconditionResult {
static AtomicRewritePreconditionResult block_satisfies_atomic_rewrite_precondition(ByteCode const& bytecode, Block repeated_block, Block following_block, auto const& all_blocks)
{
Vector<Vector<CompareTypeAndValuePair>> repeated_values;
MatchState state;
auto state = MatchState::only_for_enumeration();
auto has_seen_actionable_opcode = false;
for (state.instruction_position = repeated_block.start; state.instruction_position < repeated_block.end;) {
auto& opcode = bytecode.get_opcode(state);
@ -680,7 +680,7 @@ bool Regex<Parser>::attempt_rewrite_entire_match_as_substring_search(BasicBlockL
// We have a single basic block, let's see if it's a series of character or string compares.
StringBuilder final_string;
MatchState state;
auto state = MatchState::only_for_enumeration();
while (state.instruction_position < bytecode.size()) {
auto& opcode = bytecode.get_opcode(state);
switch (opcode.opcode_id()) {
@ -796,7 +796,7 @@ void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&
Optional<Block> fork_fallback_block;
if (i + 1 < basic_blocks.size())
fork_fallback_block = basic_blocks[i + 1];
MatchState state;
auto state = MatchState::only_for_enumeration();
// Check if the last instruction in this block is a jump to the block itself:
{
state.instruction_position = forking_block.end;
@ -913,7 +913,7 @@ void Regex<Parser>::attempt_rewrite_loops_as_atomic_groups(BasicBlockList const&
}
if (!needed_patches.is_empty()) {
MatchState state;
auto state = MatchState::only_for_enumeration();
auto bytecode_size = bytecode.size();
state.instruction_position = 0;
struct Patch {
@ -1039,7 +1039,7 @@ void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives
auto has_any_backwards_jump = false;
MatchState state;
auto state = MatchState::only_for_enumeration();
for (size_t i = 0; i < alternatives.size(); ++i) {
auto& alternative = alternatives[i];
@ -1144,7 +1144,7 @@ void Optimizer::append_alternation(ByteCode& target, Span<ByteCode> alternatives
node.metadata_value().size(),
node.metadata_value().size() == 1 ? "" : "s");
MatchState state;
auto state = MatchState::only_for_enumeration();
state.instruction_position = node.metadata_value().first().instruction_position;
auto& opcode = alternatives[node.metadata_value().first().alternative_index].get_opcode(state);
insn = ByteString::formatted("{} {}", opcode.to_byte_string(), opcode.arguments_string());

View file

@ -28,7 +28,6 @@ enum class AllFlags {
SingleLine = __Regex_SingleLine, // Dot matches newline characters
Sticky = __Regex_Sticky, // Force the pattern to only match consecutive matches from where the previous match ended.
Multiline = __Regex_Multiline, // Handle newline characters. Match each line, one by one.
SkipTrimEmptyMatches = __Regex_SkipTrimEmptyMatches, // Do not remove empty capture group results.
SingleMatch = __Regex_SingleMatch, // Stop after acquiring a single match.
UnicodeSets = __Regex_UnicodeSets, // Only for ECMA262, Allow set operations in character classes.
Internal_Stateful = __Regex_Internal_Stateful, // Make global matches match one result at a time, and further match() calls on the same instance continue where the previous one left off.
@ -49,7 +48,6 @@ enum class PosixFlags : FlagsUnderlyingType {
MatchNotBeginOfLine = (FlagsUnderlyingType)AllFlags::MatchNotBeginOfLine,
MatchNotEndOfLine = (FlagsUnderlyingType)AllFlags::MatchNotEndOfLine,
SkipSubExprResults = (FlagsUnderlyingType)AllFlags::SkipSubExprResults,
SkipTrimEmptyMatches = (FlagsUnderlyingType)AllFlags::SkipTrimEmptyMatches,
Multiline = (FlagsUnderlyingType)AllFlags::Multiline,
SingleMatch = (FlagsUnderlyingType)AllFlags::SingleMatch,
};

View file

@ -857,7 +857,7 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, si
auto current_capture_group = m_parser_state.capture_groups_count;
if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
bytecode.insert_bytecode_group_capture_left(current_capture_group);
bytecode.insert_bytecode_group_capture_left(current_capture_group + 1);
m_parser_state.capture_groups_count++;
}
@ -888,9 +888,9 @@ ALWAYS_INLINE bool PosixExtendedParser::parse_sub_expression(ByteCode& stack, si
if (!(m_parser_state.regex_options & AllFlags::SkipSubExprResults || prevent_capture_group)) {
if (capture_group_name.has_value())
bytecode.insert_bytecode_group_capture_right(current_capture_group, capture_group_name.value());
bytecode.insert_bytecode_group_capture_right(current_capture_group + 1, capture_group_name.value());
else
bytecode.insert_bytecode_group_capture_right(current_capture_group);
bytecode.insert_bytecode_group_capture_right(current_capture_group + 1);
}
should_parse_repetition_symbol = true;
break;

View file

@ -228,7 +228,6 @@ PatternErrorOr<Component> Component::compile(Utf8View const& input, PatternParse
auto flags = regex::RegexOptions<ECMAScriptFlags> {
(regex::ECMAScriptFlags)regex::AllFlags::SingleMatch
| (regex::ECMAScriptFlags)regex::AllFlags::Global
| (regex::ECMAScriptFlags)regex::AllFlags::SkipTrimEmptyMatches
| regex::ECMAScriptFlags::BrowserExtended
};
@ -288,7 +287,7 @@ Component::Result Component::create_match_result(String const& input, regex::Reg
// 4. Let index be 1.
// 5. While index is less than Get(execResult, "length"):
for (size_t index = 1; index <= exec_result.n_capture_groups; ++index) {
auto const& capture = exec_result.capture_group_matches[0][index];
auto const& capture = exec_result.capture_group_matches[0][index - 1];
// 1. Let name be component's group name list[index - 1].
auto name = group_name_list[index - 1];

View file

@ -373,7 +373,7 @@ TEST_CASE(ini_file_entries)
}
EXPECT_EQ(result.matches.at(0).view, "[Window]");
EXPECT_EQ(result.capture_group_matches.at(0).at(0).view, "Window");
EXPECT_EQ(result.capture_group_matches.at(0).at(1).view, "Window");
EXPECT_EQ(result.matches.at(1).view, "Opacity=255");
EXPECT_EQ(result.matches.at(1).line, 1u);
EXPECT_EQ(result.matches.at(1).column, 0u);