LibRegex: Allow duplicate named capture groups in separate alternatives

Author: https://github.com/aplefull Commit: 389a63d6bf Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/3815 Reviewed-by: https://github.com/alimpfard
2025-07-29 12:19:54 +00:00 · 2025-03-01 22:46:10 +01:00 · 2025-03-01 22:46:10 +01:00 · 389a63d6bf · 2025-03-05 13:37:12 +00:00
commit 389a63d6bf
parent 7df9e00650
3 changed files with 43 additions and 8 deletions
--- a/Libraries/LibRegex/RegexParser.cpp
+++ b/Libraries/LibRegex/RegexParser.cpp
@ -981,10 +981,24 @@ bool ECMA262Parser::parse_pattern(ByteCode& stack, size_t& match_length_minimum,
    return parse_disjunction(stack, match_length_minimum, flags);
 }
 bool ECMA262Parser::has_duplicate_in_current_alternative(DeprecatedFlyString const& name)
 {
    auto it = m_parser_state.named_capture_groups.find(name);
    if (it == m_parser_state.named_capture_groups.end())
        return false;
    return any_of(it->value.begin(), it->value.end(), [&](auto& group) {
        return group.alternative_id == m_current_alternative_id;
    });
 }
 bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_minimum, ParseFlags flags)
 {
    size_t total_match_length_minimum = NumericLimits<size_t>::max();
    Vector<ByteCode> alternatives;
    TemporaryChange<size_t> alternative_id_change { m_current_alternative_id, 1 };
    while (true) {
        ByteCode alternative_stack;
        size_t alternative_minimum_length = 0;
@ -998,10 +1012,13 @@ bool ECMA262Parser::parse_disjunction(ByteCode& stack, size_t& match_length_mini
        if (!match(TokenType::Pipe))
            break;
        consume();
        m_current_alternative_id += 1;
    }
    Optimizer::append_alternation(stack, alternatives.span());
    match_length_minimum = total_match_length_minimum;
    return true;
 }
@ -1622,19 +1639,26 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini
            set_error(Error::InvalidNameForCaptureGroup);
            return false;
        }
-        auto maybe_capture_group = m_parser_state.named_capture_groups.get(name);
+
-        if (!maybe_capture_group.has_value()) {
+        auto it = m_parser_state.named_capture_groups.find(name);
        if (it == m_parser_state.named_capture_groups.end()) {
            set_error(Error::InvalidNameForCaptureGroup);
            return false;
        }
-        auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(maybe_capture_group.value());
+
        // Use the first occurrence of the named group for the backreference
        // This follows ECMAScript behavior where \k<name> refers to the first
        // group with that name in left-to-right order, regardless of alternative
        auto group_index = it->value.first().group_index;
        auto maybe_length = m_parser_state.capture_group_minimum_lengths.get(group_index);
        if (!maybe_length.has_value()) {
            set_error(Error::InvalidNameForCaptureGroup);
            return false;
        }
        match_length_minimum += maybe_length.value();
-        stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)maybe_capture_group.value() } });
+        match_length_minimum += maybe_length.value();
        stack.insert_bytecode_compare_values({ { CharacterCompareType::Reference, (ByteCodeValueType)group_index } });
        return true;
    }
@ -2674,12 +2698,12 @@ bool ECMA262Parser::parse_capture_group(ByteCode& stack, size_t& match_length_mi
                return false;
            }
-            if (m_parser_state.named_capture_groups.contains(name)) {
+            if (has_duplicate_in_current_alternative(name)) {
                set_error(Error::DuplicateNamedCapture);
                return false;
            }
-            m_parser_state.named_capture_groups.set(name, group_index);
+            m_parser_state.named_capture_groups.ensure(name).append({ group_index, m_current_alternative_id });
            ByteCode capture_group_bytecode;
            size_t length = 0;
--- a/Libraries/LibRegex/RegexParser.h
+++ b/Libraries/LibRegex/RegexParser.h
@ -44,6 +44,11 @@ template<>
 struct ParserTraits<ECMA262Parser> : public GenericParserTraits<ECMAScriptOptions> {
 };
 struct NamedCaptureGroup {
    size_t group_index;
    size_t alternative_id;
 };
 class Parser {
 public:
    struct Result {
@ -111,7 +116,7 @@ protected:
        size_t repetition_mark_count { 0 };
        AllOptions regex_options;
        HashMap<size_t, size_t> capture_group_minimum_lengths;
-        HashMap<DeprecatedFlyString, size_t> named_capture_groups;
+        HashMap<DeprecatedFlyString, Vector<NamedCaptureGroup>> named_capture_groups;
        explicit ParserState(Lexer& lexer)
            : lexer(lexer)
@ -276,6 +281,8 @@ private:
    bool parse_invalid_braced_quantifier(); // Note: This function either parses and *fails*, or doesn't parse anything and returns false.
    Optional<u8> parse_legacy_octal_escape();
    bool has_duplicate_in_current_alternative(DeprecatedFlyString const& name);
    size_t ensure_total_number_of_capturing_parenthesis();
    void enter_capture_group_scope() { m_capture_groups_in_scope.empend(); }
@ -298,6 +305,9 @@ private:
    // Most patterns should have no need to ever populate this field.
    Optional<size_t> m_total_number_of_capturing_parenthesis;
    // We need to keep track of the current alternative's named capture groups, so we can check for duplicates.
    size_t m_current_alternative_id { 0 };
    // Keep the Annex B. behavior behind a flag, the users can enable it by passing the `ECMAScriptFlags::BrowserExtended` flag.
    bool m_should_use_browser_extended_grammar { false };
--- a/Tests/LibRegex/Regex.cpp
+++ b/Tests/LibRegex/Regex.cpp
@ -598,6 +598,7 @@ TEST_CASE(ECMA262_parse)
        { "(?<a>a)(?<a>b)"sv, regex::Error::DuplicateNamedCapture },
        { "(?<a>a)(?<b>b)(?<a>c)"sv, regex::Error::DuplicateNamedCapture },
        { "(?<a>(?<a>a))"sv, regex::Error::DuplicateNamedCapture },
        { "(?:(?<x>a)|(?<y>a)(?<x>b))(?:(?<z>c)|(?<z>d))"sv }, // Duplicate named capturing groups in separate alternatives should parse correctly
        { "(?<1a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
        { "(?<\\a>a)"sv, regex::Error::InvalidNameForCaptureGroup },
        { "(?<\ta>a)"sv, regex::Error::InvalidNameForCaptureGroup },