LibURL/Pattern: Implement the URLPattern Pattern Parser

This commit is contained in:
Shannon Booth 2025-03-18 19:31:46 +13:00 committed by Tim Flynn
parent 45d852d14b
commit 934f1ec30d
Notes: github-actions[bot] 2025-04-06 12:27:35 +00:00
3 changed files with 476 additions and 0 deletions

View file

@ -13,6 +13,7 @@ set(SOURCES
Pattern/Options.cpp
Pattern/Part.cpp
Pattern/Pattern.cpp
Pattern/PatternParser.cpp
Pattern/String.cpp
Pattern/Tokenizer.cpp
)

View file

@ -0,0 +1,405 @@
/*
* Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibURL/Pattern/Component.h>
#include <LibURL/Pattern/PatternParser.h>
#include <LibURL/Pattern/String.h>
namespace URL::Pattern {
PatternParser::PatternParser(EncodingCallback encoding_callback, String segment_wildcard_regexp)
: m_encoding_callback(move(encoding_callback))
, m_segment_wildcard_regexp(move(segment_wildcard_regexp))
{
}
// https://urlpattern.spec.whatwg.org/#consume-a-required-token
// Consumes the next token if it has the requested type. If it does not, an error naming the missing token type
// is returned instead.
PatternErrorOr<void> PatternParser::consume_a_required_token(Token::Type type)
{
    // 1. Let result be the result of running try to consume a token given parser and type.
    // 3. Return result.
    // NOTE: No caller actually needs the result, so we only report success.
    if (try_to_consume_a_token(type).has_value())
        return {};

    // 2. If result is null, then throw a TypeError.
    return ErrorInfo { MUST(String::formatted("Missing required token '{}' in URL pattern", Token::type_to_string(type))) };
}
// https://urlpattern.spec.whatwg.org/#consume-text
// Collects the values of consecutive "char" / "escaped-char" tokens, starting at the current index, into a single
// string. Stops at the first token of any other type; the returned string may therefore be empty.
String PatternParser::consume_text()
{
    // 1. Let result be the empty string.
    StringBuilder text;

    // 2. While true:
    for (;;) {
        // 1. Let token be the result of running try to consume a token given parser and "char".
        auto piece = try_to_consume_a_token(Token::Type::Char);

        if (!piece.has_value()) {
            // 2. If token is null, then set token to the result of running try to consume a token given parser
            //    and "escaped-char".
            piece = try_to_consume_a_token(Token::Type::EscapedChar);

            // 3. If token is null, then break.
            if (!piece.has_value())
                break;
        }

        // 4. Append token's value to the end of result.
        text.append(piece->value);
    }

    // 3. Return result.
    return text.to_string_without_validation();
}
// https://urlpattern.spec.whatwg.org/#maybe-add-a-part-from-the-pending-fixed-value
// Flushes any buffered fixed text into the part list as a single "fixed-text" part. Does nothing when the buffer
// is empty. An error from the encoding callback is propagated, in which case the buffer is left untouched.
PatternErrorOr<void> PatternParser::maybe_add_a_part_from_the_pending_fixed_value()
{
    // 1. If parser's pending fixed value is the empty string, then return.
    if (!m_pending_fixed_value.is_empty()) {
        // 2. Let encoded value be the result of running parser's encoding callback given parser's pending fixed value.
        auto encoded_text = TRY(m_encoding_callback(m_pending_fixed_value.to_string_without_validation()));

        // 3. Set parser's pending fixed value to the empty string.
        m_pending_fixed_value.clear();

        // 4. Let part be a new part whose type is "fixed-text", value is encoded value, and modifier is "none".
        // 5. Append part to parser's part list.
        m_part_list.append({ Part::Type::FixedText, move(encoded_text), Part::Modifier::None });
    }

    return {};
}
// https://urlpattern.spec.whatwg.org/#is-a-duplicate-name
// Reports whether any part already in the part list carries the given name.
bool PatternParser::is_a_duplicate_name(String const& name) const
{
    // 1. For each part of parser's part list:
    for (size_t index = 0; index < m_part_list.size(); ++index) {
        // 1. If part's name is name, then return true.
        if (m_part_list[index].name == name)
            return true;
    }

    // 2. Return false.
    return false;
}
// https://urlpattern.spec.whatwg.org/#add-a-part
// Turns one parsed group (prefix, optional name, optional regexp/wildcard, suffix, optional modifier) into either
// buffered fixed text or a new entry in the part list. Returns an error if the encoding callback fails or if the
// resulting part name is already in use.
PatternErrorOr<void> PatternParser::add_a_part(String const& prefix, Optional<Token const&> name_token,
    Optional<Token const&> regexp_or_wildcard_token, String const& suffix, Optional<Token const&> modifier_token)
{
    // 1. Let modifier be "none".
    auto modifier = Part::Modifier::None;

    // 2. If modifier token is not null:
    if (modifier_token.has_value()) {
        auto const& modifier_text = modifier_token->value;

        // 1. If modifier token's value is "?" then set modifier to "optional".
        if (modifier_text == "?"sv)
            modifier = Part::Modifier::Optional;
        // 2. Otherwise if modifier token's value is "*" then set modifier to "zero-or-more".
        else if (modifier_text == "*"sv)
            modifier = Part::Modifier::ZeroOrMore;
        // 3. Otherwise if modifier token's value is "+" then set modifier to "one-or-more".
        else if (modifier_text == "+"sv)
            modifier = Part::Modifier::OneOrMore;
    }

    // Steps 3 and 5 both ask whether the group has neither a name nor a regexp/wildcard.
    bool const has_matching_group = name_token.has_value() || regexp_or_wildcard_token.has_value();

    // 3. If name token is null and regexp or wildcard token is null and modifier is "none":
    // NOTE: This was a "{foo}" grouping. We add this to the pending fixed value so that it will be combined with
    //       any previous or subsequent text.
    if (!has_matching_group && modifier == Part::Modifier::None) {
        // 1. Append prefix to the end of parser's pending fixed value.
        m_pending_fixed_value.append(prefix);

        // 2. Return.
        return {};
    }

    // 4. Run maybe add a part from the pending fixed value given parser.
    TRY(maybe_add_a_part_from_the_pending_fixed_value());

    // 5. If name token is null and regexp or wildcard token is null:
    // NOTE: This was a "{foo}?" grouping. The modifier means we cannot combine it with other text. Therefore we
    //       add it as a part immediately.
    if (!has_matching_group) {
        // 1. Assert: suffix is the empty string.
        VERIFY(suffix.is_empty());

        // 2. If prefix is the empty string, then return.
        if (prefix.is_empty())
            return {};

        // 3. Let encoded value be the result of running parser's encoding callback given prefix.
        auto encoded_text = TRY(m_encoding_callback(prefix));

        // 4. Let part be a new part whose type is "fixed-text", value is encoded value, and modifier is modifier.
        // 5. Append part to parser's part list.
        m_part_list.append({ Part::Type::FixedText, move(encoded_text), modifier });

        // 6. Return.
        return {};
    }

    // 6. Let regexp value be the empty string.
    // NOTE: Next, we convert the regexp or wildcard token into a regular expression.
    String regexp_value;

    // 7. If regexp or wildcard token is null, then set regexp value to parser's segment wildcard regexp.
    if (!regexp_or_wildcard_token.has_value())
        regexp_value = m_segment_wildcard_regexp;
    // 8. Otherwise if regexp or wildcard token's type is "asterisk", then set regexp value to the full wildcard
    //    regexp value.
    else if (regexp_or_wildcard_token->type == Token::Type::Asterisk)
        regexp_value = MUST(String::from_utf8(full_wildcard_regexp_value));
    // 9. Otherwise set regexp value to regexp or wildcard token's value.
    else
        regexp_value = regexp_or_wildcard_token->value;

    // 10. Let type be "regexp".
    // NOTE: Next, we convert regexp value into a part type. We make sure to go to a regular expression first so
    //       that an equivalent "regexp" token will be treated the same as a "name" or "asterisk" token.
    auto type = Part::Type::Regexp;

    // 11. If regexp value is parser's segment wildcard regexp:
    if (regexp_value == m_segment_wildcard_regexp) {
        // 1. Set type to "segment-wildcard".
        type = Part::Type::SegmentWildcard;

        // 2. Set regexp value to the empty string.
        regexp_value = String {};
    }
    // 12. Otherwise if regexp value is the full wildcard regexp value:
    else if (regexp_value == full_wildcard_regexp_value) {
        // 1. Set type to "full-wildcard".
        type = Part::Type::FullWildcard;

        // 2. Set regexp value to the empty string.
        regexp_value = String {};
    }

    // 13. Let name be the empty string.
    // NOTE: Next, we determine the part name. This can be explicitly provided by a "name" token or be
    //       automatically assigned.
    String name;

    // 14. If name token is not null, then set name to name token's value.
    if (name_token.has_value()) {
        name = name_token->value;
    }
    // 15. Otherwise if regexp or wildcard token is not null:
    else if (regexp_or_wildcard_token.has_value()) {
        // 1. Set name to parser's next numeric name, serialized.
        // 2. Increment parser's next numeric name by 1.
        name = String::number(m_next_numeric_name++);
    }

    // 16. If the result of running is a duplicate name given parser and name is true, then throw a TypeError.
    if (is_a_duplicate_name(name))
        return ErrorInfo { MUST(String::formatted("Duplicate name '{}' provided in URL pattern", name)) };

    // 17. Let encoded prefix be the result of running parser's encoding callback given prefix.
    // NOTE: Finally, we encode the fixed text values and create the part.
    auto encoded_prefix = TRY(m_encoding_callback(prefix));

    // 18. Let encoded suffix be the result of running parser's encoding callback given suffix.
    auto encoded_suffix = TRY(m_encoding_callback(suffix));

    // 19. Let part be a new part whose type is type, value is regexp value, modifier is modifier, name is name,
    //     prefix is encoded prefix, and suffix is encoded suffix.
    // 20. Append part to parser's part list.
    m_part_list.append({ type, move(regexp_value), modifier, move(name), move(encoded_prefix), move(encoded_suffix) });

    return {};
}
// https://urlpattern.spec.whatwg.org/#try-to-consume-a-modifier-token
// Consumes the next token if it is an "other-modifier" or, failing that, an "asterisk". Returns the consumed
// token, or an empty Optional when the next token is neither.
Optional<Token const&> PatternParser::try_to_consume_a_modifier_token()
{
    // 1. Let token be the result of running try to consume a token given parser and "other-modifier".
    // 2. If token is not null, then return token.
    if (auto other_modifier = try_to_consume_a_token(Token::Type::OtherModifier); other_modifier.has_value())
        return other_modifier;

    // 3. Set token to the result of running try to consume a token given parser and "asterisk".
    // 4. Return token.
    return try_to_consume_a_token(Token::Type::Asterisk);
}
// https://urlpattern.spec.whatwg.org/#try-to-consume-a-regexp-or-wildcard-token
// Consumes a "regexp" token if one is next. An "asterisk" is only accepted as a fallback when no name token
// preceded it. Returns the consumed token, or an empty Optional.
Optional<Token const&> PatternParser::try_to_consume_a_regexp_or_wildcard_token(Optional<Token const&> name_token)
{
    // 1. Let token be the result of running try to consume a token given parser and "regexp".
    auto regexp_token = try_to_consume_a_token(Token::Type::Regexp);

    // 2. If name token is null and token is null, then set token to the result of running try to consume a token
    //    given parser and "asterisk".
    // 3. Return token.
    if (regexp_token.has_value() || name_token.has_value())
        return regexp_token;

    return try_to_consume_a_token(Token::Type::Asterisk);
}
// https://urlpattern.spec.whatwg.org/#try-to-consume-a-token
// Looks at the token under the parser's index. If its type matches, the index is advanced and the token is
// returned; otherwise the index is left alone and an empty Optional is returned.
Optional<Token const&> PatternParser::try_to_consume_a_token(Token::Type type)
{
    // 1. Assert: parser's index is less than parser's token list size.
    VERIFY(m_index < m_token_list.size());

    // 2. Let next token be parser's token list[parser's index].
    auto const& candidate = m_token_list[m_index];

    if (candidate.type == type) {
        // 4. Increment parser's index by 1.
        ++m_index;

        // 5. Return next token.
        return candidate;
    }

    // 3. If next token's type is not type return null.
    return {};
}
// https://urlpattern.spec.whatwg.org/#parse-a-pattern-string
// Tokenizes `input` with the strict policy and converts the token stream into a list of parts.
//
// `options` supplies the prefix code point and is used to generate the segment wildcard regexp.
// `encoding_callback` is run on every piece of fixed text (pending fixed values, prefixes and suffixes).
//
// Returns the part list on success. Errors from the tokenizer, from the encoding callback, from a duplicate part
// name, or from a missing required token ("close" / "end") are propagated to the caller.
PatternErrorOr<Vector<Part>> PatternParser::parse(Utf8View const& input, Options const& options, EncodingCallback encoding_callback)
{
    // 1. Let parser be a new pattern parser whose encoding callback is encoding callback and segment wildcard regexp
    //    is the result of running generate a segment wildcard regexp given options.
    PatternParser parser { move(encoding_callback), generate_a_segment_wildcard_regexp(options) };

    // 2. Set parser's token list to the result of running tokenize given input and "strict".
    parser.m_token_list = TRY(Tokenizer::tokenize(input, Tokenizer::Policy::Strict));

    // 3. While parser's index is less than parser's token list's size:
    while (parser.m_index < parser.m_token_list.size()) {
        // 1. Let char token be the result of running try to consume a token given parser and "char".
        auto char_token = parser.try_to_consume_a_token(Token::Type::Char);

        // 2. Let name token be the result of running try to consume a token given parser and "name".
        auto name_token = parser.try_to_consume_a_token(Token::Type::Name);

        // 3. Let regexp or wildcard token be the result of running try to consume a regexp or wildcard token given
        //    parser and name token.
        auto regexp_or_wildcard_token = parser.try_to_consume_a_regexp_or_wildcard_token(name_token);

        // 4. If name token is not null or regexp or wildcard token is not null:
        // NOTE: If there is a matching group, we need to add the part immediately.
        if (name_token.has_value() || regexp_or_wildcard_token.has_value()) {
            // 1. Let prefix be the empty string.
            String prefix;

            // 2. If char token is not null then set prefix to char token's value.
            if (char_token.has_value())
                prefix = char_token->value;

            // 3. If prefix is not the empty string and not options's prefix code point:
            // NOTE: An unset prefix code point stands for the spec's empty string, which a non-empty prefix can
            //       never equal. In that case the prefix is always plain fixed text rather than a part prefix.
            if (!prefix.is_empty()
                && (!options.prefix_code_point.has_value() || prefix != String::from_code_point(*options.prefix_code_point))) {
                // 1. Append prefix to the end of parser's pending fixed value.
                parser.m_pending_fixed_value.append(prefix);

                // 2. Set prefix to the empty string.
                prefix = String {};
            }

            // 4. Run maybe add a part from the pending fixed value given parser.
            TRY(parser.maybe_add_a_part_from_the_pending_fixed_value());

            // 5. Let modifier token be the result of running try to consume a modifier token given parser.
            auto modifier_token = parser.try_to_consume_a_modifier_token();

            // 6. Run add a part given parser, prefix, name token, regexp or wildcard token, the empty string,
            //    and modifier token.
            TRY(parser.add_a_part(prefix, name_token, regexp_or_wildcard_token, String {}, modifier_token));

            // 7. Continue.
            continue;
        }

        // 5. Let fixed token be char token.
        // NOTE: If there was no matching group, then we need to buffer any fixed text. We want to collect as
        //       much text as possible before adding it as a "fixed-text" part.
        auto fixed_token = char_token;

        // 6. If fixed token is null, then set fixed token to the result of running try to consume a token given
        //    parser and "escaped-char".
        if (!fixed_token.has_value())
            fixed_token = parser.try_to_consume_a_token(Token::Type::EscapedChar);

        // 7. If fixed token is not null:
        if (fixed_token.has_value()) {
            // 1. Append fixed token's value to parser's pending fixed value.
            parser.m_pending_fixed_value.append(fixed_token->value);

            // 2. Continue.
            continue;
        }

        // 8. Let open token be the result of running try to consume a token given parser and "open".
        auto open_token = parser.try_to_consume_a_token(Token::Type::Open);

        // 9. If open token is not null:
        if (open_token.has_value()) {
            // 1. Let prefix be the result of running consume text given parser.
            auto prefix = parser.consume_text();

            // 2. Set name token to the result of running try to consume a token given parser and "name".
            name_token = parser.try_to_consume_a_token(Token::Type::Name);

            // 3. Set regexp or wildcard token to the result of running try to consume a regexp or wildcard token
            //    given parser and name token.
            regexp_or_wildcard_token = parser.try_to_consume_a_regexp_or_wildcard_token(name_token);

            // 4. Let suffix be the result of running consume text given parser.
            auto suffix = parser.consume_text();

            // 5. Run consume a required token given parser and "close".
            TRY(parser.consume_a_required_token(Token::Type::Close));

            // 6. Let modifier token be the result of running try to consume a modifier token given parser.
            auto modifier_token = parser.try_to_consume_a_modifier_token();

            // 7. Run add a part given parser, prefix, name token, regexp or wildcard token, suffix, and modifier token.
            TRY(parser.add_a_part(prefix, name_token, regexp_or_wildcard_token, suffix, modifier_token));

            // 8. Continue.
            continue;
        }

        // 10. Run maybe add a part from the pending fixed value given parser.
        TRY(parser.maybe_add_a_part_from_the_pending_fixed_value());

        // 11. Run consume a required token given parser and "end".
        TRY(parser.consume_a_required_token(Token::Type::End));
    }

    // Debug-only dump of the resulting part list; compiled out unless URL_PATTERN_DEBUG is enabled.
    if constexpr (URL_PATTERN_DEBUG) {
        dbgln("Pattern parser produced the part list:");
        for (auto const& part : parser.m_part_list) {
            dbgln("Type {}, Value '{}', Modifier {}, Name '{}', Prefix '{}', Suffix '{}'",
                Part::type_to_string(part.type), part.value, Part::convert_modifier_to_string(part.modifier),
                part.name, part.prefix, part.suffix);
        }
    }

    // 4. Return parser's part list.
    return move(parser.m_part_list);
}
}

View file

@ -0,0 +1,70 @@
/*
* Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Function.h>
#include <LibURL/Pattern/Options.h>
#include <LibURL/Pattern/Part.h>
#include <LibURL/Pattern/PatternError.h>
#include <LibURL/Pattern/Tokenizer.h>
namespace URL::Pattern {
// https://urlpattern.spec.whatwg.org/#pattern-parser
// Holds the state of the pattern string parsing algorithm: the token list being walked, the current index into
// it, the fixed text buffered so far and the parts produced. The constructor is private; the static parse()
// function is the only public entry point.
class PatternParser {
public:
// https://urlpattern.spec.whatwg.org/#encoding-callback
// An encoding callback is an abstract algorithm that takes a given string input. The input will be a simple text
// piece of a pattern string. An implementing algorithm will validate and encode the input. It must return the
// encoded string or throw an exception.
using EncodingCallback = Function<PatternErrorOr<String>(String const&)>;
// https://urlpattern.spec.whatwg.org/#parse-a-pattern-string
// Parses the given pattern string into a list of parts, or returns an error.
static PatternErrorOr<Vector<Part>> parse(Utf8View const& input, Options const&, EncodingCallback);
private:
// Stores the encoding callback and the segment wildcard regexp; every other member keeps its default value.
PatternParser(EncodingCallback, String segment_wildcard_regexp);
// Token consumption helpers. Each returns the consumed token, or an empty Optional if the next token did not match.
Optional<Token const&> try_to_consume_a_token(Token::Type);
Optional<Token const&> try_to_consume_a_modifier_token();
Optional<Token const&> try_to_consume_a_regexp_or_wildcard_token(Optional<Token const&> name_token);
// Like try_to_consume_a_token(), but a non-matching token is reported as an error.
PatternErrorOr<void> consume_a_required_token(Token::Type);
// Concatenates the values of consecutive "char" and "escaped-char" tokens.
String consume_text();
// Helpers that append to the part list; errors from the encoding callback are propagated.
PatternErrorOr<void> maybe_add_a_part_from_the_pending_fixed_value();
PatternErrorOr<void> add_a_part(String const& prefix, Optional<Token const&> name_token,
Optional<Token const&> regexp_or_wildcard_token, String const& suffix, Optional<Token const&> modifier_token);
// Returns true if a part with the given name is already in the part list.
bool is_a_duplicate_name(String const&) const;
// https://urlpattern.spec.whatwg.org/#pattern-parser-token-list
// A pattern parser has an associated token list, a token list, initially an empty list.
Vector<Token> m_token_list;
// https://urlpattern.spec.whatwg.org/#pattern-parser-encoding-callback
// A pattern parser has an associated encoding callback, an encoding callback, that must be set upon creation.
EncodingCallback m_encoding_callback;
// https://urlpattern.spec.whatwg.org/#pattern-parser-segment-wildcard-regexp
// A pattern parser has an associated segment wildcard regexp, a string, that must be set upon creation.
String m_segment_wildcard_regexp;
// https://urlpattern.spec.whatwg.org/#pattern-parser-part-list
// A pattern parser has an associated part list, a part list, initially an empty list.
Vector<Part> m_part_list;
// https://urlpattern.spec.whatwg.org/#pattern-parser-pending-fixed-value
// A pattern parser has an associated pending fixed value, a string, initially the empty string.
// NOTE: Stored as a StringBuilder because the parser appends to it piece by piece.
StringBuilder m_pending_fixed_value;
// https://urlpattern.spec.whatwg.org/#pattern-parser-index
// A pattern parser has an associated index, a number, initially 0.
size_t m_index { 0 };
// https://urlpattern.spec.whatwg.org/#pattern-parser-next-numeric-name
// A pattern parser has an associated next numeric name, a number, initially 0.
size_t m_next_numeric_name { 0 };
};
}