From 934f1ec30d6340b2587c122f24e5aad7be8447f6 Mon Sep 17 00:00:00 2001
From: Shannon Booth
Date: Tue, 18 Mar 2025 19:31:46 +1300
Subject: [PATCH] LibURL/Pattern: Implement the URLPattern Pattern Parser

---
 Libraries/LibURL/CMakeLists.txt            |   1 +
 Libraries/LibURL/Pattern/PatternParser.cpp | 405 +++++++++++++++++++++
 Libraries/LibURL/Pattern/PatternParser.h   |  70 ++++
 3 files changed, 476 insertions(+)
 create mode 100644 Libraries/LibURL/Pattern/PatternParser.cpp
 create mode 100644 Libraries/LibURL/Pattern/PatternParser.h

diff --git a/Libraries/LibURL/CMakeLists.txt b/Libraries/LibURL/CMakeLists.txt
index 3c097378f8d..bb709e6bda5 100644
--- a/Libraries/LibURL/CMakeLists.txt
+++ b/Libraries/LibURL/CMakeLists.txt
@@ -13,6 +13,7 @@ set(SOURCES
     Pattern/Options.cpp
     Pattern/Part.cpp
     Pattern/Pattern.cpp
+    Pattern/PatternParser.cpp
     Pattern/String.cpp
     Pattern/Tokenizer.cpp
 )
diff --git a/Libraries/LibURL/Pattern/PatternParser.cpp b/Libraries/LibURL/Pattern/PatternParser.cpp
new file mode 100644
index 00000000000..53e9539f638
--- /dev/null
+++ b/Libraries/LibURL/Pattern/PatternParser.cpp
@@ -0,0 +1,405 @@
+/*
+ * Copyright (c) 2025, Shannon Booth
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <AK/Debug.h>
+#include <LibURL/Pattern/PatternParser.h>
+#include <LibURL/Pattern/String.h>
+
+namespace URL::Pattern {
+
+PatternParser::PatternParser(EncodingCallback encoding_callback, String segment_wildcard_regexp)
+    : m_encoding_callback(move(encoding_callback))
+    , m_segment_wildcard_regexp(move(segment_wildcard_regexp))
+{
+}
+
+// https://urlpattern.spec.whatwg.org/#consume-a-required-token
+PatternErrorOr<void> PatternParser::consume_a_required_token(Token::Type type)
+{
+    // 1. Let result be the result of running try to consume a token given parser and type.
+    auto result = try_to_consume_a_token(type);
+
+    // 2. If result is null, then throw a TypeError.
+    if (!result.has_value())
+        return ErrorInfo { MUST(String::formatted("Missing required token '{}' in URL pattern", Token::type_to_string(type))) };
+
+    // 3. Return result.
+    // NOTE: No caller actually needs the result, so we just ignore it.
+    return {};
+}
+
+// https://urlpattern.spec.whatwg.org/#consume-text
+String PatternParser::consume_text()
+{
+    // 1. Let result be the empty string.
+    StringBuilder result;
+
+    // 2. While true:
+    while (true) {
+        // 1. Let token be the result of running try to consume a token given parser and "char".
+        auto token = try_to_consume_a_token(Token::Type::Char);
+
+        // 2. If token is null, then set token to the result of running try to consume a token given parser and "escaped-char".
+        if (!token.has_value())
+            token = try_to_consume_a_token(Token::Type::EscapedChar);
+
+        // 3. If token is null, then break.
+        if (!token.has_value())
+            break;
+
+        // 4. Append token’s value to the end of result.
+        result.append(token->value);
+    }
+
+    // 3. Return result.
+    return result.to_string_without_validation();
+}
+
+// https://urlpattern.spec.whatwg.org/#maybe-add-a-part-from-the-pending-fixed-value
+PatternErrorOr<void> PatternParser::maybe_add_a_part_from_the_pending_fixed_value()
+{
+    // 1. If parser’s pending fixed value is the empty string, then return.
+    if (m_pending_fixed_value.is_empty())
+        return {};
+
+    // 2. Let encoded value be the result of running parser’s encoding callback given parser’s pending fixed value.
+    auto encoded_value = TRY(m_encoding_callback(m_pending_fixed_value.to_string_without_validation()));
+
+    // 3. Set parser’s pending fixed value to the empty string.
+    m_pending_fixed_value.clear();
+
+    // 4. Let part be a new part whose type is "fixed-text", value is encoded value, and modifier is "none".
+    // 5. Append part to parser’s part list.
+    m_part_list.append({ Part::Type::FixedText, move(encoded_value), Part::Modifier::None });
+
+    return {};
+}
+
+// https://urlpattern.spec.whatwg.org/#is-a-duplicate-name
+bool PatternParser::is_a_duplicate_name(String const& name) const
+{
+    // 1. For each part of parser’s part list:
+    for (auto const& part : m_part_list) {
+        // 1. If part’s name is name, then return true.
+        if (part.name == name)
+            return true;
+    }
+
+    // 2. Return false.
+    return false;
+}
+
+// https://urlpattern.spec.whatwg.org/#add-a-part
+PatternErrorOr<void> PatternParser::add_a_part(String const& prefix, Optional<Token const&> name_token,
+    Optional<Token const&> regexp_or_wildcard_token, String const& suffix, Optional<Token const&> modifier_token)
+{
+    // 1. Let modifier be "none".
+    auto modifier = Part::Modifier::None;
+
+    // 2. If modifier token is not null:
+    if (modifier_token.has_value()) {
+        // 1. If modifier token’s value is "?" then set modifier to "optional".
+        if (modifier_token->value == "?"sv) {
+            modifier = Part::Modifier::Optional;
+        }
+        // 2. Otherwise if modifier token’s value is "*" then set modifier to "zero-or-more".
+        else if (modifier_token->value == "*"sv) {
+            modifier = Part::Modifier::ZeroOrMore;
+        }
+        // 3. Otherwise if modifier token’s value is "+" then set modifier to "one-or-more".
+        else if (modifier_token->value == "+"sv) {
+            modifier = Part::Modifier::OneOrMore;
+        }
+    }
+
+    // 3. If name token is null and regexp or wildcard token is null and modifier is "none":
+    // NOTE: This was a "{foo}" grouping. We add this to the pending fixed value so that it will be combined with
+    //       any previous or subsequent text.
+    if (!name_token.has_value() && !regexp_or_wildcard_token.has_value() && modifier == Part::Modifier::None) {
+        // 1. Append prefix to the end of parser’s pending fixed value.
+        m_pending_fixed_value.append(prefix);
+
+        // 2. Return.
+        return {};
+    }
+
+    // 4. Run maybe add a part from the pending fixed value given parser.
+    TRY(maybe_add_a_part_from_the_pending_fixed_value());
+
+    // 5. If name token is null and regexp or wildcard token is null:
+    // NOTE: This was a "{foo}?" grouping. The modifier means we cannot combine it with other text. Therefore we
+    //       add it as a part immediately.
+    if (!name_token.has_value() && !regexp_or_wildcard_token.has_value()) {
+        // 1. Assert: suffix is the empty string.
+        VERIFY(suffix.is_empty());
+
+        // 2. If prefix is the empty string, then return.
+        if (prefix.is_empty())
+            return {};
+
+        // 3. Let encoded value be the result of running parser’s encoding callback given prefix.
+        auto encoded_value = TRY(m_encoding_callback(prefix));
+
+        // 4. Let part be a new part whose type is "fixed-text", value is encoded value, and modifier is modifier.
+        // 5. Append part to parser’s part list.
+        m_part_list.append({ Part::Type::FixedText, move(encoded_value), modifier });
+
+        // 6. Return.
+        return {};
+    }
+
+    // 6. Let regexp value be the empty string.
+    // NOTE: Next, we convert the regexp or wildcard token into a regular expression.
+    String regexp_value;
+
+    // 7. If regexp or wildcard token is null, then set regexp value to parser’s segment wildcard regexp.
+    if (!regexp_or_wildcard_token.has_value()) {
+        regexp_value = m_segment_wildcard_regexp;
+    }
+    // 8. Otherwise if regexp or wildcard token’s type is "asterisk", then set regexp value to the full wildcard regexp value.
+    else if (regexp_or_wildcard_token->type == Token::Type::Asterisk) {
+        regexp_value = MUST(String::from_utf8(full_wildcard_regexp_value));
+    }
+    // 9. Otherwise set regexp value to regexp or wildcard token’s value.
+    else {
+        regexp_value = regexp_or_wildcard_token->value;
+    }
+
+    // 10. Let type be "regexp".
+    // NOTE: Next, we convert regexp value into a part type. We make sure to go to a regular expression first so
+    //       that an equivalent "regexp" token will be treated the same as a "name" or "asterisk" token.
+    auto type = Part::Type::Regexp;
+
+    // 11. If regexp value is parser’s segment wildcard regexp:
+    if (regexp_value == m_segment_wildcard_regexp) {
+        // 1. Set type to "segment-wildcard".
+        type = Part::Type::SegmentWildcard;
+
+        // 2. Set regexp value to the empty string.
+        regexp_value = String {};
+    }
+    // 12. Otherwise if regexp value is the full wildcard regexp value:
+    else if (regexp_value == full_wildcard_regexp_value) {
+        // 1. Set type to "full-wildcard".
+        type = Part::Type::FullWildcard;
+
+        // 2. Set regexp value to the empty string.
+        regexp_value = String {};
+    }
+
+    // 13. Let name be the empty string.
+    // NOTE: Next, we determine the part name. This can be explicitly provided by a "name" token or be automatically assigned.
+    String name;
+
+    // 14. If name token is not null, then set name to name token’s value.
+    if (name_token.has_value()) {
+        name = name_token->value;
+    }
+    // 15. Otherwise if regexp or wildcard token is not null:
+    else if (regexp_or_wildcard_token.has_value()) {
+        // 1. Set name to parser’s next numeric name, serialized.
+        name = String::number(m_next_numeric_name);
+
+        // 2. Increment parser’s next numeric name by 1.
+        ++m_next_numeric_name;
+    }
+
+    // 16. If the result of running is a duplicate name given parser and name is true, then throw a TypeError.
+    if (is_a_duplicate_name(name))
+        return ErrorInfo { MUST(String::formatted("Duplicate name '{}' provided in URL pattern", name)) };
+
+    // 17. Let encoded prefix be the result of running parser’s encoding callback given prefix.
+    // NOTE: Finally, we encode the fixed text values and create the part.
+    auto encoded_prefix = TRY(m_encoding_callback(prefix));
+
+    // 18. Let encoded suffix be the result of running parser’s encoding callback given suffix.
+    auto encoded_suffix = TRY(m_encoding_callback(suffix));
+
+    // 19. Let part be a new part whose type is type, value is regexp value, modifier is modifier, name is name, prefix
+    //     is encoded prefix, and suffix is encoded suffix.
+    // 20. Append part to parser’s part list.
+    m_part_list.append({ type, move(regexp_value), modifier, move(name), move(encoded_prefix), move(encoded_suffix) });
+
+    return {};
+}
+
+// https://urlpattern.spec.whatwg.org/#try-to-consume-a-modifier-token
+Optional<Token const&> PatternParser::try_to_consume_a_modifier_token()
+{
+    // 1. Let token be the result of running try to consume a token given parser and "other-modifier".
+    auto token = try_to_consume_a_token(Token::Type::OtherModifier);
+
+    // 2. If token is not null, then return token.
+    if (token.has_value())
+        return token;
+
+    // 3. Set token to the result of running try to consume a token given parser and "asterisk".
+    token = try_to_consume_a_token(Token::Type::Asterisk);
+
+    // 4. Return token.
+    return token;
+}
+
+// https://urlpattern.spec.whatwg.org/#try-to-consume-a-regexp-or-wildcard-token
+Optional<Token const&> PatternParser::try_to_consume_a_regexp_or_wildcard_token(Optional<Token const&> name_token)
+{
+    // 1. Let token be the result of running try to consume a token given parser and "regexp".
+    auto token = try_to_consume_a_token(Token::Type::Regexp);
+
+    // 2. If name token is null and token is null, then set token to the result of running try to consume a token given
+    //    parser and "asterisk".
+    if (!name_token.has_value() && !token.has_value())
+        token = try_to_consume_a_token(Token::Type::Asterisk);
+
+    // 3. Return token.
+    return token;
+}
+
+// https://urlpattern.spec.whatwg.org/#try-to-consume-a-token
+Optional<Token const&> PatternParser::try_to_consume_a_token(Token::Type type)
+{
+    // 1. Assert: parser’s index is less than parser’s token list size.
+    VERIFY(m_index < m_token_list.size());
+
+    // 2. Let next token be parser’s token list[parser’s index].
+    auto const& next_token = m_token_list[m_index];
+
+    // 3. If next token’s type is not type return null.
+    if (next_token.type != type)
+        return {};
+
+    // 4. Increment parser’s index by 1.
+    ++m_index;
+
+    // 5. Return next token.
+    return next_token;
+}
+
+// https://urlpattern.spec.whatwg.org/#parse-a-pattern-string
+PatternErrorOr<Vector<Part>> PatternParser::parse(Utf8View const& input, Options const& options, EncodingCallback encoding_callback)
+{
+    // 1. Let parser be a new pattern parser whose encoding callback is encoding callback and segment wildcard regexp
+    //    is the result of running generate a segment wildcard regexp given options.
+    PatternParser parser { move(encoding_callback), generate_a_segment_wildcard_regexp(options) };
+
+    // 2. Set parser’s token list to the result of running tokenize given input and "strict".
+    parser.m_token_list = TRY(Tokenizer::tokenize(input, Tokenizer::Policy::Strict));
+
+    // 3. While parser’s index is less than parser’s token list's size:
+    while (parser.m_index < parser.m_token_list.size()) {
+        // 1. Let char token be the result of running try to consume a token given parser and "char".
+        auto char_token = parser.try_to_consume_a_token(Token::Type::Char);
+
+        // 2. Let name token be the result of running try to consume a token given parser and "name".
+        auto name_token = parser.try_to_consume_a_token(Token::Type::Name);
+
+        // 3. Let regexp or wildcard token be the result of running try to consume a regexp or wildcard token given
+        //    parser and name token.
+        auto regexp_or_wildcard_token = parser.try_to_consume_a_regexp_or_wildcard_token(name_token);
+
+        // 4. If name token is not null or regexp or wildcard token is not null:
+        // NOTE: If there is a matching group, we need to add the part immediately.
+        if (name_token.has_value() || regexp_or_wildcard_token.has_value()) {
+            // 1. Let prefix be the empty string.
+            String prefix;
+
+            // 2. If char token is not null then set prefix to char token’s value.
+            if (char_token.has_value())
+                prefix = char_token->value;
+
+            // 3. If prefix is not the empty string and not options’s prefix code point (an unset prefix code point is the empty string, which no non-empty prefix equals):
+            if (!prefix.is_empty() && (!options.prefix_code_point.has_value() || prefix != String::from_code_point(*options.prefix_code_point))) {
+                // 1. Append prefix to the end of parser’s pending fixed value.
+                parser.m_pending_fixed_value.append(prefix);
+
+                // 2. Set prefix to the empty string.
+                prefix = String {};
+            }
+
+            // 4. Run maybe add a part from the pending fixed value given parser.
+            TRY(parser.maybe_add_a_part_from_the_pending_fixed_value());
+
+            // 5. Let modifier token be the result of running try to consume a modifier token given parser.
+            auto modifier_token = parser.try_to_consume_a_modifier_token();
+
+            // 6. Run add a part given parser, prefix, name token, regexp or wildcard token, the empty string,
+            //    and modifier token.
+            TRY(parser.add_a_part(prefix, name_token, regexp_or_wildcard_token, String {}, modifier_token));
+
+            // 7. Continue.
+            continue;
+        }
+
+        // 5. Let fixed token be char token.
+        // NOTE: If there was no matching group, then we need to buffer any fixed text. We want to collect as
+        //       much text as possible before adding it as a "fixed-text" part.
+        auto fixed_token = char_token;
+
+        // 6. If fixed token is null, then set fixed token to the result of running try to consume a token given
+        //    parser and "escaped-char".
+        if (!fixed_token.has_value())
+            fixed_token = parser.try_to_consume_a_token(Token::Type::EscapedChar);
+
+        // 7. If fixed token is not null:
+        if (fixed_token.has_value()) {
+            // 1. Append fixed token’s value to parser’s pending fixed value.
+            parser.m_pending_fixed_value.append(fixed_token->value);
+
+            // 2. Continue.
+            continue;
+        }
+
+        // 8. Let open token be the result of running try to consume a token given parser and "open".
+        auto open_token = parser.try_to_consume_a_token(Token::Type::Open);
+
+        // 9. If open token is not null:
+        if (open_token.has_value()) {
+            // 1. Let prefix be the result of running consume text given parser.
+            auto prefix = parser.consume_text();
+
+            // 2. Set name token to the result of running try to consume a token given parser and "name".
+            name_token = parser.try_to_consume_a_token(Token::Type::Name);
+
+            // 3. Set regexp or wildcard token to the result of running try to consume a regexp or wildcard token
+            //    given parser and name token.
+            regexp_or_wildcard_token = parser.try_to_consume_a_regexp_or_wildcard_token(name_token);
+
+            // 4. Let suffix be the result of running consume text given parser.
+            auto suffix = parser.consume_text();
+
+            // 5. Run consume a required token given parser and "close".
+            TRY(parser.consume_a_required_token(Token::Type::Close));
+
+            // 6. Let modifier token be the result of running try to consume a modifier token given parser.
+            auto modifier_token = parser.try_to_consume_a_modifier_token();
+
+            // 7. Run add a part given parser, prefix, name token, regexp or wildcard token, suffix, and modifier token.
+            TRY(parser.add_a_part(prefix, name_token, regexp_or_wildcard_token, suffix, modifier_token));
+
+            // 8. Continue.
+            continue;
+        }
+
+        // 10. Run maybe add a part from the pending fixed value given parser.
+        TRY(parser.maybe_add_a_part_from_the_pending_fixed_value());
+
+        // 11. Run consume a required token given parser and "end".
+        TRY(parser.consume_a_required_token(Token::Type::End));
+    }
+
+    if constexpr (URL_PATTERN_DEBUG) {
+        dbgln("Pattern parser produced the part list:");
+        for (auto const& part : parser.m_part_list) {
+            dbgln("Type {}, Value '{}', Modifier {}, Name '{}', Prefix '{}', Suffix '{}'",
+                Part::type_to_string(part.type), part.value, Part::convert_modifier_to_string(part.modifier),
+                part.name, part.prefix, part.suffix);
+        }
+    }
+    // 4. Return parser’s part list.
+    return move(parser.m_part_list);
+}
+
+}
diff --git a/Libraries/LibURL/Pattern/PatternParser.h b/Libraries/LibURL/Pattern/PatternParser.h
new file mode 100644
index 00000000000..08e09d7ef25
--- /dev/null
+++ b/Libraries/LibURL/Pattern/PatternParser.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2025, Shannon Booth
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/Function.h>
+#include <LibURL/Pattern/Options.h>
+#include <LibURL/Pattern/Part.h>
+#include <LibURL/Pattern/PatternError.h>
+#include <LibURL/Pattern/Tokenizer.h>
+
+namespace URL::Pattern {
+
+// https://urlpattern.spec.whatwg.org/#pattern-parser
+class PatternParser {
+public:
+    // https://urlpattern.spec.whatwg.org/#encoding-callback
+    // An encoding callback is an abstract algorithm that takes a given string input. The input will be a simple text
+    // piece of a pattern string. An implementing algorithm will validate and encode the input. It must return the
+    // encoded string or throw an exception.
+    using EncodingCallback = Function<PatternErrorOr<String>(String const&)>;
+
+    static PatternErrorOr<Vector<Part>> parse(Utf8View const& input, Options const&, EncodingCallback);
+
+private:
+    PatternParser(EncodingCallback, String segment_wildcard_regexp);
+
+    Optional<Token const&> try_to_consume_a_token(Token::Type);
+    Optional<Token const&> try_to_consume_a_modifier_token();
+    Optional<Token const&> try_to_consume_a_regexp_or_wildcard_token(Optional<Token const&> name_token);
+    PatternErrorOr<void> consume_a_required_token(Token::Type);
+    String consume_text();
+    PatternErrorOr<void> maybe_add_a_part_from_the_pending_fixed_value();
+    PatternErrorOr<void> add_a_part(String const& prefix, Optional<Token const&> name_token,
+        Optional<Token const&> regexp_or_wildcard_token, String const& suffix, Optional<Token const&> modifier_token);
+    bool is_a_duplicate_name(String const&) const;
+
+    // https://urlpattern.spec.whatwg.org/#pattern-parser-token-list
+    // A pattern parser has an associated token list, a token list, initially an empty list.
+    Vector<Token> m_token_list;
+
+    // https://urlpattern.spec.whatwg.org/#pattern-parser-encoding-callback
+    // A pattern parser has an associated encoding callback, an encoding callback, that must be set upon creation.
+    EncodingCallback m_encoding_callback;
+
+    // https://urlpattern.spec.whatwg.org/#pattern-parser-segment-wildcard-regexp
+    // A pattern parser has an associated segment wildcard regexp, a string, that must be set upon creation.
+    String m_segment_wildcard_regexp;
+
+    // https://urlpattern.spec.whatwg.org/#pattern-parser-part-list
+    // A pattern parser has an associated part list, a part list, initially an empty list.
+    Vector<Part> m_part_list;
+
+    // https://urlpattern.spec.whatwg.org/#pattern-parser-pending-fixed-value
+    // A pattern parser has an associated pending fixed value, a string, initially the empty string.
+    StringBuilder m_pending_fixed_value;
+
+    // https://urlpattern.spec.whatwg.org/#pattern-parser-index
+    // A pattern parser has an associated index, a number, initially 0.
+    size_t m_index { 0 };
+
+    // https://urlpattern.spec.whatwg.org/#pattern-parser-next-numeric-name
+    // A pattern parser has an associated next numeric name, a number, initially 0.
+    size_t m_next_numeric_name { 0 };
+};
+
+}