diff --git a/Libraries/LibRegex/RegexParser.cpp b/Libraries/LibRegex/RegexParser.cpp index 16cc9ed8e10..b37ff67357a 100644 --- a/Libraries/LibRegex/RegexParser.cpp +++ b/Libraries/LibRegex/RegexParser.cpp @@ -981,10 +981,24 @@ bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum, return parse_disjunction(stack, match_length_minimum, flags); } +bool ECMA262Parser::has_duplicate_in_current_alternative(DeprecatedFlyString const& name) +{ + auto it = m_parser_state.named_capture_groups.find(name); + if (it == m_parser_state.named_capture_groups.end()) + return false; + + return any_of(it->value.begin(), it->value.end(), [&](auto& group) { + return group.alternative_id == m_current_alternative_id; + }); +} + bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, ParseFlags flags) { size_t total_match_length_minimum = NumericLimits::max(); Vector alternatives; + + TemporaryChange alternative_id_change { m_current_alternative_id, 1 }; + while (true) { ByteCode alternative_stack; size_t alternative_minimum_length = 0; @@ -998,10 +1012,13 @@ bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_mini if (!match(TokenType::Pipe)) break; consume(); + + m_current_alternative_id += 1; } Optimizer::append_alternation(stack, alternatives.span()); match_length_minimum = total_match_length_minimum; + return true; } @@ -1622,19 +1639,26 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini set_error(Error::InvalidNameForCaptureGroup); return false; } - auto maybe_capture_group = m_parser_state.named_capture_groups.get(name); - if (!maybe_capture_group.has_value()) { + + auto it = m_parser_state.named_capture_groups.find(name); + if (it == m_parser_state.named_capture_groups.end()) { set_error(Error::InvalidNameForCaptureGroup); return false; } - auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(maybe_capture_group.value()); + + // Use the first occurrence of the named group for the backreference + // This follows ECMAScript behavior where \k refers to the first + // group with that name in left-to-right order, regardless of alternative + auto group_index = it->value.first().group_index; + + auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(group_index); if (!maybe_length.has_value()) { set_error(Error::InvalidNameForCaptureGroup); return false; } - match_length_minimum += maybe_length.value(); - stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)maybe_capture_group.value() } }); + match_length_minimum += maybe_length.value(); + stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)group_index } }); return true; } @@ -2674,12 +2698,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi return false; } - if (m_parser_state.named_capture_groups.contains(name)) { + if (has_duplicate_in_current_alternative(name)) { set_error(Error::DuplicateNamedCapture); return false; } - m_parser_state.named_capture_groups.set(name, group_index); + m_parser_state.named_capture_groups.ensure(name).append({ group_index, m_current_alternative_id }); ByteCode capture_group_bytecode; size_t length = 0; diff --git a/Libraries/LibRegex/RegexParser.h b/Libraries/LibRegex/RegexParser.h index 6ac48c00531..2118e21d022 100644 --- a/Libraries/LibRegex/RegexParser.h +++ b/Libraries/LibRegex/RegexParser.h @@ -44,6 +44,11 @@ template<> struct ParserTraits : public GenericParserTraits { }; +struct NamedCaptureGroup { + size_t group_index; + size_t alternative_id; +}; + class Parser { public: struct Result { @@ -111,7 +116,7 @@ protected: size_t repetition_mark_count { 0 }; AllOptions regex_options; HashMap capture_group_minimum_lengths; - HashMap named_capture_groups; + HashMap> named_capture_groups; explicit ParserState(Lexer& lexer) : lexer(lexer) @@ -276,6 +281,8 @@ private: bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false. Optional parse_legacy_octal_escape(); + bool has_duplicate_in_current_alternative(DeprecatedFlyString const& name); + size_t ensure_total_number_of_capturing_parenthesis(); void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); } @@ -298,6 +305,9 @@ private: // Most patterns should have no need to ever populate this field. Optional m_total_number_of_capturing_parenthesis; + // We need to keep track of the current alternative's named capture groups, so we can check for duplicates. + size_t m_current_alternative_id { 0 }; + // Keep the Annex B. behavior behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag. bool m_should_use_browser_extended_grammar { false }; diff --git a/Tests/LibRegex/Regex.cpp b/Tests/LibRegex/Regex.cpp index 0a69e5f522b..1a0099b13cf 100644 --- a/Tests/LibRegex/Regex.cpp +++ b/Tests/LibRegex/Regex.cpp @@ -598,6 +598,7 @@ TEST_CASE(ECMA262_parse) { "(?a)(?b)"sv, regex::Error::DuplicateNamedCapture }, { "(?a)(?b)(?c)"sv, regex::Error::DuplicateNamedCapture }, { "(?(?a))"sv, regex::Error::DuplicateNamedCapture }, + { "(?:(?a)|(?a)(?b))(?:(?c)|(?d))"sv }, // Duplicate named capturing groups in separate alternatives should parse correctly { "(?<1a>a)"sv, regex::Error::InvalidNameForCaptureGroup }, { "(?<\\a>a)"sv, regex::Error::InvalidNameForCaptureGroup }, { "(?<\ta>a)"sv, regex::Error::InvalidNameForCaptureGroup },