LibURL/Pattern: Implement the URL Pattern Tokenizer

The tokenizer is used for both pattern string and constructor string parsing of URL Patterns.
Author: https://github.com/shannonbooth Commit: f8f21319f9 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/3812 Reviewed-by: https://github.com/tcl3 Reviewed-by: https://github.com/trflynn89
2025-09-26 19:28:59 +00:00 · 2025-02-18 17:25:02 +13:00 · 2025-02-18 17:25:02 +13:00 · f8f21319f9 · 2025-03-15 11:40:38 +00:00
commit f8f21319f9
parent 70e3a48892
5 changed files with 563 additions and 0 deletions
--- a/AK/Debug.h.in
+++ b/AK/Debug.h.in
@ -262,6 +262,10 @@
 #    cmakedefine01 URL_PARSER_DEBUG
 #endif

+#ifndef URL_PATTERN_DEBUG
+#    cmakedefine01 URL_PATTERN_DEBUG
+#endif
+
 #ifndef UTF8_DEBUG
 #    cmakedefine01 UTF8_DEBUG
 #endif
--- a/Libraries/LibURL/CMakeLists.txt
+++ b/Libraries/LibURL/CMakeLists.txt
@ -8,6 +8,7 @@ set(SOURCES
    URL.cpp
    ${PUBLIC_SUFFIX_SOURCES}
    Pattern/Pattern.cpp
+    Pattern/Tokenizer.cpp
 )

 serenity_lib(LibURL url)
--- a/Libraries/LibURL/Pattern/Tokenizer.cpp
+++ b/Libraries/LibURL/Pattern/Tokenizer.cpp
@ -0,0 +1,440 @@
+/*
+ * Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Debug.h>
+#include <LibURL/Pattern/Tokenizer.h>
+#include <LibUnicode/CharacterTypes.h>
+
+namespace URL::Pattern {
+
+// Returns the name of the given token type as a string; Token::to_string() uses it to label tokens.
+StringView Token::type_to_string(Token::Type type)
+{
+    // Every enumerator returns from inside the switch, so leaving the switch means `type` is not a declared enumerator.
+    switch (type) {
+    case Type::Open:
+        return "Open"sv;
+    case Type::Close:
+        return "Close"sv;
+    case Type::Regexp:
+        return "Regexp"sv;
+    case Type::Name:
+        return "Name"sv;
+    case Type::Char:
+        return "Char"sv;
+    case Type::EscapedChar:
+        return "EscapedChar"sv;
+    case Type::OtherModifier:
+        return "OtherModifier"sv;
+    case Type::Asterisk:
+        return "Asterisk"sv;
+    case Type::End:
+        return "End"sv;
+    case Type::InvalidChar:
+        return "InvalidChar"sv;
+    }
+    VERIFY_NOT_REACHED();
+}
+
+// Formats the token as "<type>, index: <index>, value: '<value>'".
+String Token::to_string() const
+{
+    auto const type_name = type_to_string(type);
+    return MUST(String::formatted("{}, index: {}, value: '{}'", type_name, index, value));
+}
+
+// Stores the input view and the tokenize policy; all other members keep their in-class defaults.
+Tokenizer::Tokenizer(Utf8View const& input, Policy policy)
+    : m_input { input }
+    , m_policy { policy }
+{
+}
+
+// https://urlpattern.spec.whatwg.org/#tokenize
+PatternErrorOr<Vector<Token>> Tokenizer::tokenize(Utf8View const& input, Tokenizer::Policy policy)
+{
+    dbgln_if(URL_PATTERN_DEBUG, "URLPattern tokenizing input: '{}'", input.as_string());
+    VERIFY(input.validate());
+
+    // 1. Let tokenizer be a new tokenizer.
+    // 2. Set tokenizer’s input to input.
+    // 3. Set tokenizer’s policy to policy.
+    Tokenizer tokenizer { input, policy };
+
+    // 4. While tokenizer’s index is less than tokenizer’s input's code point length:
+    while (tokenizer.m_index < tokenizer.m_input.length()) {
+        // 1. Run seek and get the next code point given tokenizer and tokenizer’s index.
+        tokenizer.seek_and_get_the_next_code_point(tokenizer.m_index);
+
+        // 2. If tokenizer’s code point is U+002A (*):
+        if (tokenizer.m_code_point == '*') {
+            // 1. Run add a token with default position and length given tokenizer and "asterisk".
+            tokenizer.add_a_token_with_default_position_and_length(Token::Type::Asterisk);
+
+            // 2. Continue.
+            continue;
+        }
+
+        // 3. If tokenizer’s code point is U+002B (+) or U+003F (?):
+        if (tokenizer.m_code_point == '+' || tokenizer.m_code_point == '?') {
+            // 1. Run add a token with default position and length given tokenizer and "other-modifier".
+            tokenizer.add_a_token_with_default_position_and_length(Token::Type::OtherModifier);
+
+            // 2. Continue.
+            continue;
+        }
+
+        // 4. If tokenizer’s code point is U+005C (\):
+        if (tokenizer.m_code_point == '\\') {
+            // 1. If tokenizer’s index is equal to tokenizer’s input's code point length − 1:
+            if (tokenizer.m_index == tokenizer.m_input.length() - 1) {
+                // 1. Run process a tokenizing error given tokenizer, tokenizer’s next index, and tokenizer’s index.
+                TRY(tokenizer.process_a_tokenizing_error(tokenizer.m_next_index, tokenizer.m_index));
+
+                // 2. Continue.
+                continue;
+            }
+
+            // 2. Let escaped index be tokenizer’s next index.
+            auto escaped_index = tokenizer.m_next_index;
+
+            // 3. Run get the next code point given tokenizer.
+            tokenizer.get_the_next_code_point();
+
+            // 4. Run add a token with default length given tokenizer, "escaped-char", tokenizer’s next index, and escaped index.
+            tokenizer.add_a_token_with_default_length(Token::Type::EscapedChar, tokenizer.m_next_index, escaped_index);
+
+            // 5. Continue.
+            continue;
+        }
+
+        // 5. If tokenizer’s code point is U+007B ({):
+        if (tokenizer.m_code_point == '{') {
+            // 1. Run add a token with default position and length given tokenizer and "open".
+            tokenizer.add_a_token_with_default_position_and_length(Token::Type::Open);
+
+            // 2. Continue.
+            continue;
+        }
+
+        // 6. If tokenizer’s code point is U+007D (}):
+        if (tokenizer.m_code_point == '}') {
+            // 1. Run add a token with default position and length given tokenizer and "close".
+            tokenizer.add_a_token_with_default_position_and_length(Token::Type::Close);
+
+            // 2. Continue.
+            continue;
+        }
+
+        // 7. If tokenizer’s code point is U+003A (:):
+        if (tokenizer.m_code_point == ':') {
+            // 1. Let name position be tokenizer’s next index.
+            auto name_position = tokenizer.m_next_index;
+
+            // 2. Let name start be name position.
+            auto name_start = name_position;
+
+            // 3. While name position is less than tokenizer’s input's code point length:
+            while (name_position < tokenizer.m_input.length()) {
+                // 1. Run seek and get the next code point given tokenizer and name position.
+                tokenizer.seek_and_get_the_next_code_point(name_position);
+
+                // 2. Let first code point be true if name position equals name start and false otherwise.
+                bool first_code_point = name_position == name_start;
+
+                // 3. Let valid code point be the result of running is a valid name code point given tokenizer’s code point and first code point.
+                bool valid_code_point = is_a_valid_name_code_point(tokenizer.m_code_point, first_code_point);
+
+                // 4. If valid code point is false break.
+                if (!valid_code_point)
+                    break;
+
+                // 5. Set name position to tokenizer’s next index.
+                name_position = tokenizer.m_next_index;
+            }
+
+            // 4. If name position is less than or equal to name start:
+            if (name_position <= name_start) {
+                // 1. Run process a tokenizing error given tokenizer, name start, and tokenizer’s index.
+                TRY(tokenizer.process_a_tokenizing_error(name_start, tokenizer.m_index));
+
+                // 2. Continue.
+                continue;
+            }
+
+            // 5. Run add a token with default length given tokenizer, "name", name position, and name start.
+            tokenizer.add_a_token_with_default_length(Token::Type::Name, name_position, name_start);
+
+            // 6. Continue.
+            continue;
+        }
+
+        // 8. If tokenizer’s code point is U+0028 (():
+        if (tokenizer.m_code_point == '(') {
+            // 1. Let depth be 1.
+            u32 depth = 1;
+
+            // 2. Let regexp position be tokenizer’s next index.
+            auto regexp_position = tokenizer.m_next_index;
+
+            // 3. Let regexp start be regexp position.
+            auto regexp_start = regexp_position;
+
+            // 4. Let error be false.
+            bool error = false;
+
+            // 5. While regexp position is less than tokenizer’s input's code point length:
+            while (regexp_position < tokenizer.m_input.length()) {
+                // 1. Run seek and get the next code point given tokenizer and regexp position.
+                tokenizer.seek_and_get_the_next_code_point(regexp_position);
+
+                // 2. If the result of running is ASCII given tokenizer’s code point is false:
+                if (!is_ascii(tokenizer.m_code_point)) {
+                    // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
+                    TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));
+
+                    // 2. Set error to true.
+                    error = true;
+
+                    // 3. Break.
+                    break;
+                }
+
+                // 3. If regexp position equals regexp start and tokenizer’s code point is U+003F (?):
+                if (regexp_position == regexp_start && tokenizer.m_code_point == '?') {
+                    // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
+                    TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));
+
+                    // 2. Set error to true.
+                    error = true;
+
+                    // 3. Break.
+                    break;
+                }
+
+                // 4. If tokenizer’s code point is U+005C (\):
+                if (tokenizer.m_code_point == '\\') {
+                    // 1. If regexp position equals tokenizer’s input's code point length − 1:
+                    if (regexp_position == tokenizer.m_input.length() - 1) {
+                        // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
+                        TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));
+
+                        // 2. Set error to true.
+                        error = true;
+
+                        // 3. Break.
+                        break;
+                    }
+
+                    // 2. Run get the next code point given tokenizer.
+                    tokenizer.get_the_next_code_point();
+
+                    // 3. If the result of running is ASCII given tokenizer’s code point is false:
+                    if (!is_ascii(tokenizer.m_code_point)) {
+                        // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
+                        TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));
+
+                        // 2. Set error to true.
+                        error = true;
+
+                        // 3. Break.
+                        break;
+                    }
+
+                    // 4. Set regexp position to tokenizer’s next index.
+                    regexp_position = tokenizer.m_next_index;
+
+                    // 5. Continue.
+                    continue;
+                }
+
+                // 5. If tokenizer’s code point is U+0029 ()):
+                if (tokenizer.m_code_point == ')') {
+                    // 1. Decrement depth by 1.
+                    --depth;
+
+                    // 2. If depth is 0:
+                    if (depth == 0) {
+                        // 1. Set regexp position to tokenizer’s next index.
+                        regexp_position = tokenizer.m_next_index;
+
+                        // 2. Break.
+                        break;
+                    }
+                }
+                // 6. Otherwise if tokenizer’s code point is U+0028 (():
+                else if (tokenizer.m_code_point == '(') {
+                    // 1. Increment depth by 1.
+                    ++depth;
+
+                    // 2. If regexp position equals tokenizer’s input's code point length − 1:
+                    if (regexp_position == tokenizer.m_input.length() - 1) {
+                        // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
+                        TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));
+
+                        // 2. Set error to true.
+                        error = true;
+
+                        // 3. Break.
+                        break;
+                    }
+
+                    // 3. Let temporary position be tokenizer’s next index.
+                    auto temporary_position = tokenizer.m_next_index;
+
+                    // 4. Run get the next code point given tokenizer.
+                    tokenizer.get_the_next_code_point();
+
+                    // 5. If tokenizer’s code point is not U+003F (?):
+                    if (tokenizer.m_code_point != '?') {
+                        // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
+                        TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));
+
+                        // 2. Set error to true.
+                        error = true;
+
+                        // 3. Break.
+                        break;
+                    }
+
+                    // 6. Set tokenizer’s next index to temporary position (undoes the one code point look-ahead from step 4, so step 7 resumes right after the '(').
+                    tokenizer.m_next_index = temporary_position;
+                }
+
+                // 7. Set regexp position to tokenizer’s next index.
+                regexp_position = tokenizer.m_next_index;
+            }
+
+            // 6. If error is true continue.
+            if (error)
+                continue;
+
+            // 7. If depth is not zero:
+            if (depth != 0) {
+                // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
+                TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));
+
+                // 2. Continue.
+                continue;
+            }
+
+            // 8. Let regexp length be regexp position − regexp start − 1.
+            auto regexp_length = regexp_position - regexp_start - 1;
+
+            // 9. If regexp length is zero:
+            if (regexp_length == 0) {
+                // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer’s index.
+                TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));
+
+                // 2. Continue.
+                continue;
+            }
+
+            // 10. Run add a token given tokenizer, "regexp", regexp position, regexp start, and regexp length.
+            tokenizer.add_a_token(Token::Type::Regexp, regexp_position, regexp_start, regexp_length);
+
+            // 11. Continue.
+            continue;
+        }
+
+        // 9. Run add a token with default position and length given tokenizer and "char".
+        tokenizer.add_a_token_with_default_position_and_length(Token::Type::Char);
+    }
+
+    // 5. Run add a token with default length given tokenizer, "end", tokenizer’s index, and tokenizer’s index.
+    tokenizer.add_a_token_with_default_length(Token::Type::End, tokenizer.m_index, tokenizer.m_index);
+
+    // 6. Return tokenizer’s token list (every token is dumped first when URL_PATTERN_DEBUG is enabled).
+    if constexpr (URL_PATTERN_DEBUG) {
+        for (auto const& token : tokenizer.m_token_list)
+            dbgln("{}", token.to_string());
+    }
+
+    return tokenizer.m_token_list;
+}
+
+// https://urlpattern.spec.whatwg.org/#get-the-next-code-point
+void Tokenizer::get_the_next_code_point()
+{
+    // 1. Set tokenizer’s code point to the Unicode code point in tokenizer’s input at the position indicated by tokenizer’s next index.
+    auto const code_point_view = m_input.unicode_substring_view(m_next_index, 1);
+    m_code_point = *code_point_view.begin();
+
+    // 2. Increment tokenizer’s next index by 1.
+    m_next_index += 1;
+}
+
+// https://urlpattern.spec.whatwg.org/#seek-and-get-the-next-code-point
+void Tokenizer::seek_and_get_the_next_code_point(u32 index)
+{
+    // 1. Set tokenizer’s next index to index.
+    m_next_index = index;
+
+    // 2. Run get the next code point given tokenizer (reads the code point at index and leaves next index at index + 1).
+    get_the_next_code_point();
+}
+
+// https://urlpattern.spec.whatwg.org/#add-a-token
+void Tokenizer::add_a_token(Token::Type type, u32 next_position, u32 value_position, u32 value_length)
+{
+    // The code point substring that becomes the token's value (used by step 4).
+    auto const value_view = m_input.unicode_substring_view(value_position, value_length);
+
+    // 1. Let token be a new token.
+    // 2. Set token’s type to type.
+    // 3. Set token’s index to tokenizer’s index.
+    // 4. Set token’s value to the code point substring from value position with length value length within tokenizer’s input.
+    Token token;
+    token.type = type;
+    token.index = m_index;
+    token.value = MUST(String::from_utf8(value_view.as_string()));
+
+    // 5. Append token to the back of tokenizer’s token list.
+    m_token_list.append(move(token));
+
+    // 6. Set tokenizer’s index to next position.
+    m_index = next_position;
+}
+
+// https://urlpattern.spec.whatwg.org/#add-a-token-with-default-length
+void Tokenizer::add_a_token_with_default_length(Token::Type type, u32 next_position, u32 value_position)
+{
+    // 1. Let computed length be next position − value position.
+    // 2. Run add a token given tokenizer, type, next position, value position, and computed length.
+    add_a_token(type, next_position, value_position, next_position - value_position);
+}
+
+// https://urlpattern.spec.whatwg.org/#add-a-token-with-default-position-and-length
+void Tokenizer::add_a_token_with_default_position_and_length(Token::Type type)
+{
+    // 1. Run add a token with default length given tokenizer, type, tokenizer’s next index, and tokenizer’s index (the value spans from index up to, but not including, next index).
+    add_a_token_with_default_length(type, m_next_index, m_index);
+}
+
+// https://urlpattern.spec.whatwg.org/#process-a-tokenizing-error
+PatternErrorOr<void> Tokenizer::process_a_tokenizing_error(u32 next_position, u32 value_position)
+{
+    if (m_policy != Policy::Strict) {
+        // 2. Assert: tokenizer’s policy is "lenient".
+        VERIFY(m_policy == Policy::Lenient);
+
+        // 3. Run add a token with default length given tokenizer, "invalid-char", next position, and value position.
+        add_a_token_with_default_length(Token::Type::InvalidChar, next_position, value_position);
+        return {};
+    }
+
+    // 1. If tokenizer’s policy is "strict", then throw a TypeError.
+    return ErrorInfo { "Error processing a token"_string }; // FIXME: Improve this error!
+}
+
+// https://urlpattern.spec.whatwg.org/#is-a-valid-name-code-point
+bool Tokenizer::is_a_valid_name_code_point(u32 code_point, bool first)
+{
+    // 1. If first is true return the result of checking if code point is contained in the IdentifierStart set of code points.
+    // NOTE: ECMAScript's IdentifierStartChar is UnicodeIDStart, U+0024 ($) or U+005F (_).
+    if (first)
+        return code_point == '$' || code_point == '_' || Unicode::code_point_has_identifier_start_property(code_point);
+
+    // 2. Otherwise return the result of checking if code point is contained in the IdentifierPart set of code points.
+    // NOTE: ECMAScript's IdentifierPartChar is UnicodeIDContinue, U+0024 ($), U+200C (ZWNJ) or U+200D (ZWJ). The two joiners are
+    //       checked explicitly so the result does not depend on whether the Unicode data classifies them as ID_Continue.
+    constexpr u32 zero_width_non_joiner = 0x200C;
+    constexpr u32 zero_width_joiner = 0x200D;
+    return code_point == '$' || code_point == zero_width_non_joiner || code_point == zero_width_joiner
+        || Unicode::code_point_has_identifier_continue_property(code_point);
+}
+
+}
--- a/Libraries/LibURL/Pattern/Tokenizer.h
+++ b/Libraries/LibURL/Pattern/Tokenizer.h
@ -0,0 +1,117 @@
+/*
+ * Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/String.h>
+#include <LibURL/Pattern/PatternError.h>
+
+namespace URL::Pattern {
+
+// https://urlpattern.spec.whatwg.org/#token
+// A token is a struct representing a single lexical token within a pattern string.
+struct Token {
+    // https://urlpattern.spec.whatwg.org/#token-type
+    enum class Type {
+        // The token represents a U+007B ({) code point.
+        Open,
+
+        // The token represents a U+007D (}) code point.
+        Close,
+
+        // The token represents a string of the form "(<regular expression>)". The regular expression is required to consist of only ASCII code points.
+        Regexp,
+
+        // The token represents a string of the form ":<name>". The name value is restricted to code points that are consistent with JavaScript identifiers.
+        Name,
+
+        // The token represents a valid pattern code point without any special syntactical meaning.
+        Char,
+
+        // The token represents a code point escaped using a backslash like "\<char>".
+        EscapedChar,
+
+        // The token represents a matching group modifier that is either the U+003F (?) or U+002B (+) code points.
+        OtherModifier,
+
+        // The token represents a U+002A (*) code point that can be either a wildcard matching group or a matching group modifier.
+        Asterisk,
+
+        // The token represents the end of the pattern string.
+        End,
+
+        // The token represents a code point that is invalid in the pattern. This could be because of the code point value
+        // itself or due to its location within the pattern relative to other syntactic elements.
+        InvalidChar,
+    };
+
+    // https://urlpattern.spec.whatwg.org/#token-type
+    // A token has an associated type, a string, initially "invalid-char".
+    Type type { Type::InvalidChar };
+
+    // https://urlpattern.spec.whatwg.org/#token-index
+    // A token has an associated index, a number, initially 0. It is the position (counted in code points) of the first code point in the pattern string represented by the token.
+    u32 index { 0 };
+
+    // https://urlpattern.spec.whatwg.org/#token-value
+    // A token has an associated value, a string, initially the empty string. It contains the code points from the pattern string represented by the token.
+    String value;
+
+    String to_string() const; // Formats the token as "<type>, index: <index>, value: '<value>'".
+    static StringView type_to_string(Token::Type); // Returns the enumerator's name, e.g. "Open".
+};
+
+// https://urlpattern.spec.whatwg.org/#tokenizer
+// A tokenizer is a struct.
+class Tokenizer {
+public:
+    // https://urlpattern.spec.whatwg.org/#tokenize-policy
+    // A tokenize policy is a string that must be either "strict" or "lenient".
+    enum class Policy {
+        Strict,
+        Lenient,
+    };
+
+    static PatternErrorOr<Vector<Token>> tokenize(Utf8View const&, Policy); // The input must be valid UTF-8; tokenize() VERIFYs this.
+
+    static bool is_a_valid_name_code_point(u32 code_point, bool first); // `first` selects the IdentifierStart check instead of IdentifierPart.
+
+private:
+    Tokenizer(Utf8View const& input, Policy);
+
+    void get_the_next_code_point();
+    void seek_and_get_the_next_code_point(u32 index);
+    void add_a_token(Token::Type, u32 next_position, u32 value_position, u32 value_length);
+    void add_a_token_with_default_length(Token::Type, u32 next_position, u32 value_position);
+    void add_a_token_with_default_position_and_length(Token::Type);
+    PatternErrorOr<void> process_a_tokenizing_error(u32 next_position, u32 value_position);
+
+    // https://urlpattern.spec.whatwg.org/#tokenizer-input
+    // A tokenizer has an associated input, a pattern string, initially the empty string.
+    Utf8View m_input;
+
+    // https://urlpattern.spec.whatwg.org/#tokenizer-policy
+    // A tokenizer has an associated policy, a tokenize policy, initially "strict".
+    Policy m_policy { Policy::Strict };
+
+    // https://urlpattern.spec.whatwg.org/#tokenizer-token-list
+    // A tokenizer has an associated token list, a token list, initially an empty list.
+    Vector<Token> m_token_list;
+
+    // https://urlpattern.spec.whatwg.org/#tokenizer-index
+    // A tokenizer has an associated index, a number, initially 0. It is the code point position recorded as the index of the next token added.
+    size_t m_index { 0 };
+
+    // https://urlpattern.spec.whatwg.org/#tokenizer-next-index
+    // A tokenizer has an associated next index, a number, initially 0. It is the code point position that get_the_next_code_point() reads from.
+    size_t m_next_index { 0 };
+
+    // https://urlpattern.spec.whatwg.org/#tokenizer-code-point
+    // A tokenizer has an associated code point, a Unicode code point, initially null (value-initialized to 0 here).
+    u32 m_code_point {};
+};
+
+}
--- a/Meta/CMake/all_the_debug_macros.cmake
+++ b/Meta/CMake/all_the_debug_macros.cmake
@ -61,6 +61,7 @@ set(TLS_DEBUG ON)
 set(TOKENIZER_TRACE_DEBUG ON)
 set(UPDATE_LAYOUT_DEBUG ON)
 set(URL_PARSER_DEBUG ON)
+set(URL_PATTERN_DEBUG ON)
 set(UTF8_DEBUG ON)
 set(VPX_DEBUG ON)
 set(WASI_DEBUG ON)