LibRegex: Allow duplicate named capture groups in separate alternatives

This commit is contained in:
aplefull 2025-03-01 22:46:10 +01:00 committed by Ali Mohammad Pur
commit 389a63d6bf
Notes: github-actions[bot] 2025-03-05 13:37:12 +00:00
3 changed files with 43 additions and 8 deletions

View file

@ -981,10 +981,24 @@ bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum,
return parse_disjunction(stack, match_length_minimum, flags); return parse_disjunction(stack, match_length_minimum, flags);
} }
// Reports whether a capture group called `name` has already been registered
// for the alternative that is currently being parsed.
//
// Looks `name` up in m_parser_state.named_capture_groups and returns true only
// if one of the recorded groups carries an alternative_id equal to
// m_current_alternative_id. An unknown name, or a name that was only used with
// other alternative ids, yields false.
//
// NOTE(review): parse_disjunction appears to restart m_current_alternative_id
// at 1 for every (possibly nested) disjunction, so ids recorded inside a nested
// group may not line up with the enclosing alternative - confirm that patterns
// such as `(?:y|(?<a>x))(?<a>z)` are still rejected as duplicates.
bool ECMA262Parser::has_duplicate_in_current_alternative(DeprecatedFlyString const& name)
{
    auto& groups_by_name = m_parser_state.named_capture_groups;
    auto entry = groups_by_name.find(name);

    if (entry != groups_by_name.end()) {
        // Scan every group already recorded under this name.
        for (auto& candidate : entry->value) {
            if (candidate.alternative_id == m_current_alternative_id)
                return true;
        }
    }

    return false;
}
bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, ParseFlags flags) bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, ParseFlags flags)
{ {
size_t total_match_length_minimum = NumericLimits<size_t>::max(); size_t total_match_length_minimum = NumericLimits<size_t>::max();
Vector<ByteCode> alternatives; Vector<ByteCode> alternatives;
TemporaryChange<size_t> alternative_id_change { m_current_alternative_id, 1 };
while (true) { while (true) {
ByteCode alternative_stack; ByteCode alternative_stack;
size_t alternative_minimum_length = 0; size_t alternative_minimum_length = 0;
@ -998,10 +1012,13 @@ bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_mini
if (!match(TokenType::Pipe)) if (!match(TokenType::Pipe))
break; break;
consume(); consume();
m_current_alternative_id += 1;
} }
Optimizer::append_alternation(stack, alternatives.span()); Optimizer::append_alternation(stack, alternatives.span());
match_length_minimum = total_match_length_minimum; match_length_minimum = total_match_length_minimum;
return true; return true;
} }
@ -1622,19 +1639,26 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
set_error(Error::InvalidNameForCaptureGroup); set_error(Error::InvalidNameForCaptureGroup);
return false; return false;
} }
auto maybe_capture_group = m_parser_state.named_capture_groups.get(name);
if (!maybe_capture_group.has_value()) { auto it = m_parser_state.named_capture_groups.find(name);
if (it == m_parser_state.named_capture_groups.end()) {
set_error(Error::InvalidNameForCaptureGroup); set_error(Error::InvalidNameForCaptureGroup);
return false; return false;
} }
auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(maybe_capture_group.value());
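// Use the first occurrence of the named group for the backreference, i.e. the
// first group with that name in left-to-right order, regardless of alternative.
// FIXME: ECMAScript resolves \k<name> against whichever group with that name
// actually participated in the match, so this is only correct when the first
// group's alternative is the one that was taken.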
auto group_index = it->value.first().group_index;
auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(group_index);
if (!maybe_length.has_value()) { if (!maybe_length.has_value()) {
set_error(Error::InvalidNameForCaptureGroup); set_error(Error::InvalidNameForCaptureGroup);
return false; return false;
} }
match_length_minimum += maybe_length.value();
stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)maybe_capture_group.value() } }); match_length_minimum += maybe_length.value();
stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)group_index } });
return true; return true;
} }
@ -2674,12 +2698,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
return false; return false;
} }
if (m_parser_state.named_capture_groups.contains(name)) { if (has_duplicate_in_current_alternative(name)) {
set_error(Error::DuplicateNamedCapture); set_error(Error::DuplicateNamedCapture);
return false; return false;
} }
m_parser_state.named_capture_groups.set(name, group_index); m_parser_state.named_capture_groups.ensure(name).append({ group_index, m_current_alternative_id });
ByteCode capture_group_bytecode; ByteCode capture_group_bytecode;
size_t length = 0; size_t length = 0;

View file

@ -44,6 +44,11 @@ template<>
struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> { struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
}; };
// One occurrence of a named capture group.
//
// group_index    - index of the capture group this name refers to.
// alternative_id - id of the alternative in which the group was declared; used
//                  to tell apart same-named groups from different alternatives.
//
// Both members are zero-initialized by default so that a default-constructed
// instance never carries indeterminate values. The type remains an aggregate,
// so `{ group_index, alternative_id }` initialization keeps working.
struct NamedCaptureGroup {
    size_t group_index { 0 };
    size_t alternative_id { 0 };
};
class Parser { class Parser {
public: public:
struct Result { struct Result {
@ -111,7 +116,7 @@ protected:
size_t repetition_mark_count { 0 }; size_t repetition_mark_count { 0 };
AllOptions regex_options; AllOptions regex_options;
HashMap<size_t, size_t> capture_group_minimum_lengths; HashMap<size_t, size_t> capture_group_minimum_lengths;
HashMap<DeprecatedFlyString, size_t> named_capture_groups; HashMap<DeprecatedFlyString, Vector<NamedCaptureGroup>> named_capture_groups;
explicit ParserState(Lexer& lexer) explicit ParserState(Lexer& lexer)
: lexer(lexer) : lexer(lexer)
@ -276,6 +281,8 @@ private:
bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false. bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
Optional<u8> parse_legacy_octal_escape(); Optional<u8> parse_legacy_octal_escape();
bool has_duplicate_in_current_alternative(DeprecatedFlyString const& name);
size_t ensure_total_number_of_capturing_parenthesis(); size_t ensure_total_number_of_capturing_parenthesis();
void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); } void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
@ -298,6 +305,9 @@ private:
// Most patterns should have no need to ever populate this field. // Most patterns should have no need to ever populate this field.
Optional<size_t> m_total_number_of_capturing_parenthesis; Optional<size_t> m_total_number_of_capturing_parenthesis;
// Identifier of the alternative that is currently being parsed. It is stored with every named capture group so that duplicate names within the same alternative can be detected.
size_t m_current_alternative_id { 0 };
// Keep the Annex B. behavior behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag. // Keep the Annex B. behavior behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
bool m_should_use_browser_extended_grammar { false }; bool m_should_use_browser_extended_grammar { false };

View file

@ -598,6 +598,7 @@ TEST_CASE(ECMA262_parse)
{ "(?<a>a)(?<a>b)"sv, regex::Error::DuplicateNamedCapture }, { "(?<a>a)(?<a>b)"sv, regex::Error::DuplicateNamedCapture },
{ "(?<a>a)(?<b>b)(?<a>c)"sv, regex::Error::DuplicateNamedCapture }, { "(?<a>a)(?<b>b)(?<a>c)"sv, regex::Error::DuplicateNamedCapture },
{ "(?<a>(?<a>a))"sv, regex::Error::DuplicateNamedCapture }, { "(?<a>(?<a>a))"sv, regex::Error::DuplicateNamedCapture },
{ "(?:(?<x>a)|(?<y>a)(?<x>b))(?:(?<z>c)|(?<z>d))"sv }, // Duplicate named capturing groups in separate alternatives should parse correctly
{ "(?<1a>a)"sv, regex::Error::InvalidNameForCaptureGroup }, { "(?<1a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
{ "(?<\\a>a)"sv, regex::Error::InvalidNameForCaptureGroup }, { "(?<\\a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
{ "(?<\ta>a)"sv, regex::Error::InvalidNameForCaptureGroup }, { "(?<\ta>a)"sv, regex::Error::InvalidNameForCaptureGroup },