LibRegex: Fix capture groups in quantified alternations
Some checks are pending
CI / macOS, arm64, Sanitizer, Clang (push) Waiting to run
CI / Linux, x86_64, Fuzzers, Clang (push) Waiting to run
CI / Linux, x86_64, Sanitizer, GNU (push) Waiting to run
CI / Linux, x86_64, Sanitizer, Clang (push) Waiting to run
Package the js repl as a binary artifact / Linux, arm64 (push) Waiting to run
Run test262 and test-wasm / run_and_update_results (push) Waiting to run
Package the js repl as a binary artifact / macOS, arm64 (push) Waiting to run
Package the js repl as a binary artifact / Linux, x86_64 (push) Waiting to run
Lint Code / lint (push) Waiting to run
Label PRs with merge conflicts / auto-labeler (push) Waiting to run
Push notes / build (push) Waiting to run

This prevents empty matches from overwriting non-empty captures in
quantified alternations. Fixes patterns like (a|a?)+ where the optional
branch would incorrectly overwrite meaningful captures with empty
strings.
This commit is contained in:
aplefull 2025-07-22 18:07:31 +02:00 committed by Ali Mohammad Pur
commit e2f8f5a350
Notes: github-actions[bot] 2025-07-24 11:20:26 +00:00
2 changed files with 48 additions and 1 deletions

View file

@ -389,7 +389,21 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightCaptureGroup::execute(MatchInput c
VERIFY(start_position + length <= input.view.length()); VERIFY(start_position + length <= input.view.length());
state.mutable_capture_group_matches(input.match_index).at(id() - 1) = { input.view.substring_view(start_position, length), input.line, start_position, input.global_offset + start_position }; auto captured_text = input.view.substring_view(start_position, length);
// NOTE: Don't overwrite existing capture with empty match at the same position. The ECMA-262 RepeatMatcher
// continuation chain effectively preserves captures when an empty match occurs at the position where the
// existing capture ended.
// See: https://tc39.es/ecma262/#step-repeatmatcher-done
auto& existing_capture = state.mutable_capture_group_matches(input.match_index).at(id() - 1);
if (length == 0 && !existing_capture.view.is_null() && existing_capture.view.length() > 0) {
auto existing_end_position = existing_capture.global_offset - input.global_offset + existing_capture.view.length();
if (existing_end_position == state.string_position) {
return ExecutionResult::Continue;
}
}
state.mutable_capture_group_matches(input.match_index).at(id() - 1) = { captured_text, input.line, start_position, input.global_offset + start_position };
return ExecutionResult::Continue; return ExecutionResult::Continue;
} }
@ -410,6 +424,16 @@ ALWAYS_INLINE ExecutionResult OpCode_SaveRightNamedCaptureGroup::execute(MatchIn
auto view = input.view.substring_view(start_position, length); auto view = input.view.substring_view(start_position, length);
// Same logic as in SaveRightCaptureGroup above.
// https://tc39.es/ecma262/#step-repeatmatcher-done
auto& existing_capture = state.mutable_capture_group_matches(input.match_index).at(id() - 1);
if (length == 0 && !existing_capture.view.is_null() && existing_capture.view.length() > 0) {
auto existing_end_position = existing_capture.global_offset - input.global_offset + existing_capture.view.length();
if (existing_end_position == state.string_position) {
return ExecutionResult::Continue;
}
}
state.mutable_capture_group_matches(input.match_index).at(id() - 1) = { view, name_string_table_index(), input.line, start_position, input.global_offset + start_position }; state.mutable_capture_group_matches(input.match_index).at(id() - 1) = { view, name_string_table_index(), input.line, start_position, input.global_offset + start_position };
return ExecutionResult::Continue; return ExecutionResult::Continue;

View file

@ -1330,6 +1330,29 @@ TEST_CASE(optimizer_repeat_offset)
} }
} }
TEST_CASE(quantified_alternation_capture_groups)
{
    // Regression test for capture groups inside a quantified alternation:
    // with ^(a|a?)+$ the group must end up holding the last non-empty "a",
    // not an empty string produced by the optional `a?` branch.
    StringView const subjects[] = { "a"sv, "aa"sv };

    for (auto subject : subjects) {
        Regex<ECMA262> re("^(a|a?)+$");
        auto result = re.match(subject);

        // The pattern is anchored, so there is exactly one match and it spans the whole subject.
        EXPECT_EQ(result.success, true);
        EXPECT_EQ(result.matches.size(), 1u);
        EXPECT_EQ(result.matches.first().view.to_byte_string(), subject);

        // Regardless of how many iterations ran, the group reports a single "a".
        EXPECT_EQ(result.capture_group_matches.first()[0].view.to_byte_string(), "a"sv);
    }
}
TEST_CASE(zero_width_backreference) TEST_CASE(zero_width_backreference)
{ {
{ {