LibURL/Pattern: Implement the URL Pattern Tokenizer

The tokenizer is used for both pattern string and constructor string
parsing of URL Patterns.
This commit is contained in:
Shannon Booth 2025-02-18 17:25:02 +13:00 committed by Tim Flynn
parent 70e3a48892
commit f8f21319f9
Notes: github-actions[bot] 2025-03-15 11:40:38 +00:00
5 changed files with 563 additions and 0 deletions

View file

@ -262,6 +262,10 @@
# cmakedefine01 URL_PARSER_DEBUG
#endif
#ifndef URL_PATTERN_DEBUG
# cmakedefine01 URL_PATTERN_DEBUG
#endif
#ifndef UTF8_DEBUG
# cmakedefine01 UTF8_DEBUG
#endif

View file

@ -8,6 +8,7 @@ set(SOURCES
URL.cpp
${PUBLIC_SUFFIX_SOURCES}
Pattern/Pattern.cpp
Pattern/Tokenizer.cpp
)
serenity_lib(LibURL url)

View file

@ -0,0 +1,440 @@
/*
* Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Debug.h>
#include <LibURL/Pattern/Tokenizer.h>
#include <LibUnicode/CharacterTypes.h>
namespace URL::Pattern {
// Returns the name of the given token type as a string view (e.g. "Open", "Regexp").
// Used by Token::to_string() when formatting a token.
// Every enumerator is handled explicitly; falling out of the switch is a fatal error.
StringView Token::type_to_string(Token::Type type)
{
    switch (type) {
    case Type::Open:
        return "Open"sv;
    case Type::Close:
        return "Close"sv;
    case Type::Regexp:
        return "Regexp"sv;
    case Type::Name:
        return "Name"sv;
    case Type::Char:
        return "Char"sv;
    case Type::EscapedChar:
        return "EscapedChar"sv;
    case Type::OtherModifier:
        return "OtherModifier"sv;
    case Type::Asterisk:
        return "Asterisk"sv;
    case Type::End:
        return "End"sv;
    case Type::InvalidChar:
        return "InvalidChar"sv;
    }

    VERIFY_NOT_REACHED();
}
// Formats this token as "<type name>, index: <index>, value: '<value>'".
String Token::to_string() const
{
    auto const type_name = type_to_string(type);
    return MUST(String::formatted("{}, index: {}, value: '{}'", type_name, index, value));
}
Tokenizer::Tokenizer(Utf8View const& input, Policy policy)
: m_input(input)
, m_policy(policy)
{
}
// https://urlpattern.spec.whatwg.org/#tokenize
//
// Splits `input` into a list of URL Pattern tokens. On success the returned list always
// ends with a token of type End.
//
// `input` must be valid UTF-8; this is enforced with VERIFY.
//
// `policy` selects how tokenizing errors are handled (see process_a_tokenizing_error()):
//   - Policy::Strict:  the first tokenizing error is returned to the caller as an error.
//   - Policy::Lenient: the offending code point is emitted as an InvalidChar token and
//                      tokenizing continues.
PatternErrorOr<Vector<Token>> Tokenizer::tokenize(Utf8View const& input, Tokenizer::Policy policy)
{
    dbgln_if(URL_PATTERN_DEBUG, "URLPattern tokenizing input: '{}'", input.as_string());
    VERIFY(input.validate());

    // 1. Let tokenizer be a new tokenizer.
    // 2. Set tokenizer's input to input.
    // 3. Set tokenizer's policy to policy.
    Tokenizer tokenizer { input, policy };

    // 4. While tokenizer's index is less than tokenizer's input's code point length:
    while (tokenizer.m_index < tokenizer.m_input.length()) {
        // 1. Run seek and get the next code point given tokenizer and tokenizer's index.
        tokenizer.seek_and_get_the_next_code_point(tokenizer.m_index);

        // 2. If tokenizer's code point is U+002A (*):
        if (tokenizer.m_code_point == '*') {
            // 1. Run add a token with default position and length given tokenizer and "asterisk".
            tokenizer.add_a_token_with_default_position_and_length(Token::Type::Asterisk);

            // 2. Continue.
            continue;
        }

        // 3. If tokenizer's code point is U+002B (+) or U+003F (?):
        if (tokenizer.m_code_point == '+' || tokenizer.m_code_point == '?') {
            // 1. Run add a token with default position and length given tokenizer and "other-modifier".
            tokenizer.add_a_token_with_default_position_and_length(Token::Type::OtherModifier);

            // 2. Continue.
            continue;
        }

        // 4. If tokenizer's code point is U+005C (\):
        if (tokenizer.m_code_point == '\\') {
            // 1. If tokenizer's index is equal to tokenizer's input's code point length - 1:
            if (tokenizer.m_index == tokenizer.m_input.length() - 1) {
                // 1. Run process a tokenizing error given tokenizer, tokenizer's next index, and tokenizer's index.
                TRY(tokenizer.process_a_tokenizing_error(tokenizer.m_next_index, tokenizer.m_index));

                // 2. Continue.
                continue;
            }

            // 2. Let escaped index be tokenizer's next index.
            auto escaped_index = tokenizer.m_next_index;

            // 3. Run get the next code point given tokenizer.
            tokenizer.get_the_next_code_point();

            // 4. Run add a token with default length given tokenizer, "escaped-char", tokenizer's next index, and escaped index.
            tokenizer.add_a_token_with_default_length(Token::Type::EscapedChar, tokenizer.m_next_index, escaped_index);

            // 5. Continue.
            continue;
        }

        // 5. If tokenizer's code point is U+007B ({):
        if (tokenizer.m_code_point == '{') {
            // 1. Run add a token with default position and length given tokenizer and "open".
            tokenizer.add_a_token_with_default_position_and_length(Token::Type::Open);

            // 2. Continue.
            continue;
        }

        // 6. If tokenizer's code point is U+007D (}):
        if (tokenizer.m_code_point == '}') {
            // 1. Run add a token with default position and length given tokenizer and "close".
            tokenizer.add_a_token_with_default_position_and_length(Token::Type::Close);

            // 2. Continue.
            continue;
        }

        // 7. If tokenizer's code point is U+003A (:):
        if (tokenizer.m_code_point == ':') {
            // 1. Let name position be tokenizer's next index.
            auto name_position = tokenizer.m_next_index;

            // 2. Let name start be name position.
            auto name_start = name_position;

            // 3. While name position is less than tokenizer's input's code point length:
            while (name_position < tokenizer.m_input.length()) {
                // 1. Run seek and get the next code point given tokenizer and name position.
                tokenizer.seek_and_get_the_next_code_point(name_position);

                // 2. Let first code point be true if name position equals name start and false otherwise.
                bool first_code_point = name_position == name_start;

                // 3. Let valid code point be the result of running is a valid name code point given tokenizer's code point and first code point.
                bool valid_code_point = is_a_valid_name_code_point(tokenizer.m_code_point, first_code_point);

                // 4. If valid code point is false break.
                if (!valid_code_point)
                    break;

                // 5. Set name position to tokenizer's next index.
                name_position = tokenizer.m_next_index;
            }

            // 4. If name position is less than or equal to name start:
            if (name_position <= name_start) {
                // 1. Run process a tokenizing error given tokenizer, name start, and tokenizer's index.
                TRY(tokenizer.process_a_tokenizing_error(name_start, tokenizer.m_index));

                // 2. Continue.
                continue;
            }

            // 5. Run add a token with default length given tokenizer, "name", name position, and name start.
            tokenizer.add_a_token_with_default_length(Token::Type::Name, name_position, name_start);

            // 6. Continue.
            continue;
        }

        // 8. If tokenizer's code point is U+0028 (():
        if (tokenizer.m_code_point == '(') {
            // 1. Let depth be 1.
            u32 depth = 1;

            // 2. Let regexp position be tokenizer's next index.
            auto regexp_position = tokenizer.m_next_index;

            // 3. Let regexp start be regexp position.
            auto regexp_start = regexp_position;

            // 4. Let error be false.
            bool error = false;

            // 5. While regexp position is less than tokenizer's input's code point length:
            while (regexp_position < tokenizer.m_input.length()) {
                // 1. Run seek and get the next code point given tokenizer and regexp position.
                tokenizer.seek_and_get_the_next_code_point(regexp_position);

                // 2. If the result of running is ASCII given tokenizer's code point is false:
                if (!is_ascii(tokenizer.m_code_point)) {
                    // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                    TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                    // 2. Set error to true.
                    error = true;

                    // 3. Break.
                    break;
                }

                // 3. If regexp position equals regexp start and tokenizer's code point is U+003F (?):
                if (regexp_position == regexp_start && tokenizer.m_code_point == '?') {
                    // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                    TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                    // 2. Set error to true.
                    error = true;

                    // 3. Break.
                    break;
                }

                // 4. If tokenizer's code point is U+005C (\):
                if (tokenizer.m_code_point == '\\') {
                    // 1. If regexp position equals tokenizer's input's code point length - 1:
                    if (regexp_position == tokenizer.m_input.length() - 1) {
                        // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                        TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                        // 2. Set error to true.
                        error = true;

                        // 3. Break
                        break;
                    }

                    // 2. Run get the next code point given tokenizer.
                    tokenizer.get_the_next_code_point();

                    // 3. If the result of running is ASCII given tokenizer's code point is false:
                    if (!is_ascii(tokenizer.m_code_point)) {
                        // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                        TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                        // 2. Set error to true.
                        error = true;

                        // 3. Break.
                        break;
                    }

                    // 4. Set regexp position to tokenizer's next index.
                    regexp_position = tokenizer.m_next_index;

                    // 5. Continue.
                    continue;
                }

                // 5. If tokenizer's code point is U+0029 ()):
                if (tokenizer.m_code_point == ')') {
                    // 1. Decrement depth by 1.
                    --depth;

                    // 2. If depth is 0:
                    if (depth == 0) {
                        // 1. Set regexp position to tokenizer's next index.
                        regexp_position = tokenizer.m_next_index;

                        // 2. Break.
                        break;
                    }
                }
                // 6. Otherwise if tokenizer's code point is U+0028 (():
                else if (tokenizer.m_code_point == '(') {
                    // 1. Increment depth by 1.
                    ++depth;

                    // 2. If regexp position equals tokenizer's input's code point length - 1:
                    if (regexp_position == tokenizer.m_input.length() - 1) {
                        // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                        TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                        // 2. Set error to true.
                        error = true;

                        // 3. Break
                        break;
                    }

                    // 3. Let temporary position be tokenizer's next index.
                    auto temporary_position = tokenizer.m_next_index;

                    // 4. Run get the next code point given tokenizer.
                    tokenizer.get_the_next_code_point();

                    // 5. If tokenizer's code point is not U+003F (?):
                    if (tokenizer.m_code_point != '?') {
                        // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                        TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                        // 2. Set error to true.
                        error = true;

                        // 3. Break.
                        break;
                    }

                    // 6. Set tokenizer's next index to temporary position.
                    tokenizer.m_next_index = temporary_position;
                }

                // 7. Set regexp position to tokenizer's next index.
                regexp_position = tokenizer.m_next_index;
            }

            // 6. If error is true continue.
            if (error)
                continue;

            // 7. If depth is not zero:
            if (depth != 0) {
                // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                // 2. Continue.
                continue;
            }

            // 8. Let regexp length be regexp position - regexp start - 1.
            auto regexp_length = regexp_position - regexp_start - 1;

            // 9. If regexp length is zero:
            if (regexp_length == 0) {
                // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                // 2. Continue.
                continue;
            }

            // 10. Run add a token given tokenizer, "regexp", regexp position, regexp start, and regexp length.
            tokenizer.add_a_token(Token::Type::Regexp, regexp_position, regexp_start, regexp_length);

            // 11. Continue.
            continue;
        }

        // 9. Run add a token with default position and length given tokenizer and "char".
        tokenizer.add_a_token_with_default_position_and_length(Token::Type::Char);
    }

    // 5. Run add a token with default length given tokenizer, "end", tokenizer's index, and tokenizer's index.
    tokenizer.add_a_token_with_default_length(Token::Type::End, tokenizer.m_index, tokenizer.m_index);

    if constexpr (URL_PATTERN_DEBUG) {
        for (auto const& token : tokenizer.m_token_list)
            dbgln("{}", token.to_string());
    }

    // 6. Return tokenizer's token list.
    // NOTE: A member of a local object is not implicitly moved on return, so move it explicitly
    //       to avoid copying the whole token list; the tokenizer is destroyed right after.
    return move(tokenizer.m_token_list);
}
// https://urlpattern.spec.whatwg.org/#get-the-next-code-point
// Reads the code point at the tokenizer's next index into m_code_point and advances the next index by one.
void Tokenizer::get_the_next_code_point()
{
    // 1. Set tokenizer's code point to the Unicode code point in tokenizer's input at the position indicated by
    //    tokenizer's next index.
    auto const single_code_point_view = m_input.unicode_substring_view(m_next_index, 1);
    m_code_point = *single_code_point_view.begin();

    // 2. Increment tokenizer's next index by 1.
    ++m_next_index;
}
// https://urlpattern.spec.whatwg.org/#seek-and-get-the-next-code-point
// Repositions the tokenizer's next index to `index`, then reads the code point found there.
void Tokenizer::seek_and_get_the_next_code_point(u32 index)
{
    // 1. Set tokenizer's next index to index.
    m_next_index = index;

    // 2. Run get the next code point given tokenizer.
    get_the_next_code_point();
}
// https://urlpattern.spec.whatwg.org/#add-a-token
// Appends a token of `type` to the token list and moves the tokenizer's index to `next_position`.
// The token's index is the tokenizer's current index; its value is the `value_length` code points of the
// input starting at code point offset `value_position`.
void Tokenizer::add_a_token(Token::Type type, u32 next_position, u32 value_position, u32 value_length)
{
    // 4. (Computed up front) The code point substring from value position with length value length within
    //    tokenizer's input.
    auto const value_view = m_input.unicode_substring_view(value_position, value_length);

    // 1. Let token be a new token.
    Token new_token;

    // 2. Set token's type to type.
    new_token.type = type;

    // 3. Set token's index to tokenizer's index.
    new_token.index = m_index;

    // 4. Set token's value to the substring computed above.
    new_token.value = MUST(String::from_utf8(value_view.as_string()));

    // 5. Append token to the back of tokenizer's token list.
    m_token_list.append(move(new_token));

    // 6. Set tokenizer's index to next position.
    m_index = next_position;
}
// https://urlpattern.spec.whatwg.org/#add-a-token-with-default-length
// Adds a token whose value spans from `value_position` up to (but not including) `next_position`.
void Tokenizer::add_a_token_with_default_length(Token::Type type, u32 next_position, u32 value_position)
{
    // 1. Let computed length be next position - value position.
    // 2. Run add a token given tokenizer, type, next position, value position, and computed length.
    add_a_token(type, next_position, value_position, next_position - value_position);
}
// https://urlpattern.spec.whatwg.org/#add-a-token-with-default-position-and-length
// Adds a token covering the input between the tokenizer's index and its next index.
void Tokenizer::add_a_token_with_default_position_and_length(Token::Type type)
{
    // 1. Run add a token with default length given tokenizer, type, tokenizer's next index, and tokenizer's index.
    auto const next_position = m_next_index;
    auto const value_position = m_index;
    add_a_token_with_default_length(type, next_position, value_position);
}
// https://urlpattern.spec.whatwg.org/#process-a-tokenizing-error
// Handles a tokenizing error according to the tokenizer's policy:
// with the strict policy an error is returned; with the lenient policy an InvalidChar token
// spanning `value_position` up to `next_position` is added instead and no error is returned.
PatternErrorOr<void> Tokenizer::process_a_tokenizing_error(u32 next_position, u32 value_position)
{
    // 1. If tokenizer's policy is "strict", then throw a TypeError.
    bool const is_strict = m_policy == Policy::Strict;
    if (is_strict)
        return ErrorInfo { "Error processing a token"_string }; // FIXME: Improve this error!

    // 2. Assert: tokenizer's policy is "lenient".
    VERIFY(m_policy == Policy::Lenient);

    // 3. Run add a token with default length given tokenizer, "invalid-char", next position, and value position.
    add_a_token_with_default_length(Token::Type::InvalidChar, next_position, value_position);

    return {};
}
// https://urlpattern.spec.whatwg.org/#is-a-valid-name-code-point
// Returns whether `code_point` may appear in a pattern name. `first` selects the rule for the first
// code point of the name; otherwise the rule for subsequent code points is used.
bool Tokenizer::is_a_valid_name_code_point(u32 code_point, bool first)
{
    // '$' is accepted in both positions.
    if (code_point == '$')
        return true;

    // 1. If first is true return the result of checking if code point is contained in the IdentifierStart set of
    //    code points.
    if (first) {
        if (code_point == '_')
            return true;
        return Unicode::code_point_has_identifier_start_property(code_point);
    }

    // 2. Otherwise return the result of checking if code point is contained in the IdentifierPart set of code points.
    return Unicode::code_point_has_identifier_continue_property(code_point);
}
}

View file

@ -0,0 +1,117 @@
/*
* Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/String.h>
#include <LibURL/Pattern/PatternError.h>
namespace URL::Pattern {
// https://urlpattern.spec.whatwg.org/#token
// A single lexical token produced while tokenizing a pattern string.
struct Token {
    // https://urlpattern.spec.whatwg.org/#token-type
    enum class Type {
        // A U+007B ({) code point.
        Open,

        // A U+007D (}) code point.
        Close,

        // A string of the form "(<regular expression>)". The regular expression must consist of ASCII code points only.
        Regexp,

        // A string of the form ":<name>". The name is restricted to code points that are consistent with
        // JavaScript identifiers.
        Name,

        // A valid pattern code point without any special syntactical meaning.
        Char,

        // A code point escaped using a backslash, like "\<char>".
        EscapedChar,

        // A matching group modifier that is either U+003F (?) or U+002B (+).
        OtherModifier,

        // A U+002A (*) code point, which can be either a wildcard matching group or a matching group modifier.
        Asterisk,

        // The end of the pattern string.
        End,

        // A code point that is invalid in the pattern, either because of its value or because of its location
        // relative to other syntactic elements.
        InvalidChar,
    };

    // https://urlpattern.spec.whatwg.org/#token-type
    // The kind of token; initially "invalid-char".
    Type type { Type::InvalidChar };

    // https://urlpattern.spec.whatwg.org/#token-index
    // Position of the first code point in the pattern string represented by this token; initially 0.
    u32 index { 0 };

    // https://urlpattern.spec.whatwg.org/#token-value
    // The code points from the pattern string represented by this token; initially the empty string.
    String value;

    // Formats the token (type name, index and value) as a string.
    String to_string() const;

    // Returns the name of a token type as a string view.
    static StringView type_to_string(Token::Type);
};
// https://urlpattern.spec.whatwg.org/#tokenizer
// Implements the URL Pattern tokenizer. Instances are only created internally by tokenize();
// the public interface consists of static functions.
class Tokenizer {
public:
    // https://urlpattern.spec.whatwg.org/#tokenize-policy
    // A tokenize policy is either "strict" or "lenient"; it selects how tokenizing errors are handled.
    enum class Policy {
        Strict,
        Lenient,
    };

    // https://urlpattern.spec.whatwg.org/#tokenize
    // Tokenizes the given input under the given policy, producing a token list or an error.
    static PatternErrorOr<Vector<Token>> tokenize(Utf8View const&, Policy);

    // https://urlpattern.spec.whatwg.org/#is-a-valid-name-code-point
    // Returns whether `code_point` is valid in a name; `first` indicates the first code point of the name.
    static bool is_a_valid_name_code_point(u32 code_point, bool first);

private:
    Tokenizer(Utf8View const& input, Policy);

    // Helper algorithms from the specification, operating on this tokenizer's state.
    void get_the_next_code_point();
    void seek_and_get_the_next_code_point(u32 index);
    void add_a_token(Token::Type, u32 next_position, u32 value_position, u32 value_length);
    void add_a_token_with_default_length(Token::Type, u32 next_position, u32 value_position);
    void add_a_token_with_default_position_and_length(Token::Type);
    PatternErrorOr<void> process_a_tokenizing_error(u32 next_position, u32 value_position);

    // https://urlpattern.spec.whatwg.org/#tokenizer-input
    // The pattern string being tokenized; initially the empty string.
    Utf8View m_input;

    // https://urlpattern.spec.whatwg.org/#tokenizer-policy
    // The tokenize policy in effect; initially "strict".
    Policy m_policy { Policy::Strict };

    // https://urlpattern.spec.whatwg.org/#tokenizer-token-list
    // The tokens produced so far; initially an empty list.
    Vector<Token> m_token_list;

    // https://urlpattern.spec.whatwg.org/#tokenizer-index
    // The tokenizer's index, a number; initially 0.
    size_t m_index { 0 };

    // https://urlpattern.spec.whatwg.org/#tokenizer-next-index
    // The tokenizer's next index, a number; initially 0.
    size_t m_next_index { 0 };

    // https://urlpattern.spec.whatwg.org/#tokenizer-code-point
    // The most recently read Unicode code point; initially null in the specification, zero-initialized here.
    u32 m_code_point {};
};
}

View file

@ -61,6 +61,7 @@ set(TLS_DEBUG ON)
set(TOKENIZER_TRACE_DEBUG ON)
set(UPDATE_LAYOUT_DEBUG ON)
set(URL_PARSER_DEBUG ON)
set(URL_PATTERN_DEBUG ON)
set(UTF8_DEBUG ON)
set(VPX_DEBUG ON)
set(WASI_DEBUG ON)