ladybird/Libraries/LibURL/Pattern/PatternParser.cpp
Shannon Booth 0a58497ab9 LibURL/Pattern: Fix PatternParser logic for prefix codepoint comparison
We were not properly handling the case that prefix code point was the
empty string (which we represent as an OptionalNone). While this
still resulted in the correct pattern string being generated, an
incorrect regular expression was being generated causing matching
to fail.
2025-04-07 10:29:09 -04:00

405 lines
17 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibURL/Pattern/Component.h>
#include <LibURL/Pattern/PatternParser.h>
#include <LibURL/Pattern/String.h>
namespace URL::Pattern {
PatternParser::PatternParser(EncodingCallback encoding_callback, String segment_wildcard_regexp)
: m_encoding_callback(move(encoding_callback))
, m_segment_wildcard_regexp(move(segment_wildcard_regexp))
{
}
// https://urlpattern.spec.whatwg.org/#consume-a-required-token
// Consumes the next token if it has the given type; otherwise reports an error naming the missing token type.
PatternErrorOr<void> PatternParser::consume_a_required_token(Token::Type type)
{
    // 1. Let result be the result of running try to consume a token given parser and type.
    // 3. Return result.
    // NOTE: No caller actually needs the result, so success is reported without returning the token.
    if (try_to_consume_a_token(type).has_value())
        return {};

    // 2. If result is null, then throw a TypeError.
    return ErrorInfo { MUST(String::formatted("Missing required token '{}' in URL pattern", Token::type_to_string(type))) };
}
// https://urlpattern.spec.whatwg.org/#consume-text
// Consumes a run of consecutive "char" / "escaped-char" tokens and returns their values concatenated.
// Returns the empty string when the next token is of neither type.
String PatternParser::consume_text()
{
    // 2.1. Let token be the result of running try to consume a token given parser and "char".
    // 2.2. If token is null, then set token to the result of running try to consume a token given parser and
    //      "escaped-char".
    auto consume_text_token = [this]() -> Optional<Token const&> {
        if (auto char_token = try_to_consume_a_token(Token::Type::Char); char_token.has_value())
            return char_token;
        return try_to_consume_a_token(Token::Type::EscapedChar);
    };

    // 1. Let result be the empty string.
    StringBuilder text;

    // 2. While true:
    //     3. If token is null, then break.
    //     4. Append token's value to the end of result.
    for (auto token = consume_text_token(); token.has_value(); token = consume_text_token())
        text.append(token->value);

    // 3. Return result.
    return text.to_string_without_validation();
}
// https://urlpattern.spec.whatwg.org/#maybe-add-a-part-from-the-pending-fixed-value
// Flushes any buffered fixed text into the part list as a single "fixed-text" part.
// Does nothing when the buffer is empty; propagates an error from the encoding callback, in which case the
// buffer is left untouched.
PatternErrorOr<void> PatternParser::maybe_add_a_part_from_the_pending_fixed_value()
{
    // 1. If parser's pending fixed value is the empty string, then return.
    if (m_pending_fixed_value.is_empty())
        return {};

    // 2. Let encoded value be the result of running parser's encoding callback given parser's pending fixed value.
    auto const pending_text = m_pending_fixed_value.to_string_without_validation();
    auto encoded_text = TRY(m_encoding_callback(pending_text));

    // 3. Set parser's pending fixed value to the empty string.
    m_pending_fixed_value.clear();

    // 4. Let part be a new part whose type is "fixed-text", value is encoded value, and modifier is "none".
    // 5. Append part to parser's part list.
    m_part_list.append(Part { Part::Type::FixedText, move(encoded_text), Part::Modifier::None });
    return {};
}
// https://urlpattern.spec.whatwg.org/#is-a-duplicate-name
// Reports whether any part already in the parser's part list carries the given name.
bool PatternParser::is_a_duplicate_name(String const& name) const
{
    // 1. For each part of parser's part list:
    //     1. If part's name is name, then return true.
    auto const existing = m_part_list.find_if([&name](auto const& candidate) {
        return candidate.name == name;
    });

    // 2. Return false.
    return existing != m_part_list.end();
}
// https://urlpattern.spec.whatwg.org/#add-a-part
// Converts one parsed group (prefix, optional name, optional regexp/wildcard, suffix, optional modifier) into
// either buffered fixed text or a new entry in the parser's part list.
// Returns an error if the resulting name is already used by another part, or if the encoding callback fails.
PatternErrorOr<void> PatternParser::add_a_part(String const& prefix, Optional<Token const&> name_token,
    Optional<Token const&> regexp_or_wildcard_token, String const& suffix, Optional<Token const&> modifier_token)
{
    // 1. Let modifier be "none".
    // 2. If modifier token is not null:
    //     1. If modifier token's value is "?" then set modifier to "optional".
    //     2. Otherwise if modifier token's value is "*" then set modifier to "zero-or-more".
    //     3. Otherwise if modifier token's value is "+" then set modifier to "one-or-more".
    auto const modifier = [&] {
        if (!modifier_token.has_value())
            return Part::Modifier::None;

        auto const& symbol = modifier_token->value;
        if (symbol == "?"sv)
            return Part::Modifier::Optional;
        if (symbol == "*"sv)
            return Part::Modifier::ZeroOrMore;
        if (symbol == "+"sv)
            return Part::Modifier::OneOrMore;
        return Part::Modifier::None;
    }();

    // Steps 3 and 5 both test for a group that has neither a name nor a regexp/wildcard.
    bool const is_plain_text_group = !name_token.has_value() && !regexp_or_wildcard_token.has_value();

    // 3. If name token is null and regexp or wildcard token is null and modifier is "none":
    // NOTE: This was a "{foo}" grouping. We add this to the pending fixed value so that it will be combined with
    //       any previous or subsequent text.
    if (is_plain_text_group && modifier == Part::Modifier::None) {
        // 1. Append prefix to the end of parser's pending fixed value.
        m_pending_fixed_value.append(prefix);

        // 2. Return.
        return {};
    }

    // 4. Run maybe add a part from the pending fixed value given parser.
    TRY(maybe_add_a_part_from_the_pending_fixed_value());

    // 5. If name token is null and regexp or wildcard token is null:
    // NOTE: This was a "{foo}?" grouping. The modifier means we cannot combine it with other text. Therefore we
    //       add it as a part immediately.
    if (is_plain_text_group) {
        // 1. Assert: suffix is the empty string.
        VERIFY(suffix.is_empty());

        // 2. If prefix is the empty string, then return.
        if (prefix.is_empty())
            return {};

        // 3. Let encoded value be the result of running parser's encoding callback given prefix.
        auto encoded_text = TRY(m_encoding_callback(prefix));

        // 4. Let part be a new part whose type is "fixed-text", value is encoded value, and modifier is modifier.
        // 5. Append part to parser's part list.
        m_part_list.append({ Part::Type::FixedText, move(encoded_text), modifier });

        // 6. Return.
        return {};
    }

    // 6. Let regexp value be the empty string.
    // NOTE: Next, we convert the regexp or wildcard token into a regular expression.
    // 7. If regexp or wildcard token is null, then set regexp value to parser's segment wildcard regexp.
    // 8. Otherwise if regexp or wildcard token's type is "asterisk", then set regexp value to the full wildcard
    //    regexp value.
    // 9. Otherwise set regexp value to regexp or wildcard token's value.
    auto regexp_value = [&]() -> String {
        if (!regexp_or_wildcard_token.has_value())
            return m_segment_wildcard_regexp;
        if (regexp_or_wildcard_token->type == Token::Type::Asterisk)
            return MUST(String::from_utf8(full_wildcard_regexp_value));
        return regexp_or_wildcard_token->value;
    }();

    // 10. Let type be "regexp".
    // NOTE: Next, we convert regexp value into a part type. We make sure to go to a regular expression first so
    //       that an equivalent "regexp" token will be treated the same as a "name" or "asterisk" token.
    auto type = Part::Type::Regexp;

    // 11. If regexp value is parser's segment wildcard regexp:
    //     1. Set type to "segment-wildcard".
    if (regexp_value == m_segment_wildcard_regexp)
        type = Part::Type::SegmentWildcard;
    // 12. Otherwise if regexp value is the full wildcard regexp value:
    //     1. Set type to "full-wildcard".
    else if (regexp_value == full_wildcard_regexp_value)
        type = Part::Type::FullWildcard;

    // 11.2 / 12.2. Set regexp value to the empty string (only for the two wildcard types).
    if (type != Part::Type::Regexp)
        regexp_value = String {};

    // 13. Let name be the empty string.
    // NOTE: Next, we determine the part name. This can be explicitly provided by a "name" token or be
    //       automatically assigned.
    String name;

    // 14. If name token is not null, then set name to name token's value.
    if (name_token.has_value()) {
        name = name_token->value;
    }
    // 15. Otherwise if regexp or wildcard token is not null:
    else if (regexp_or_wildcard_token.has_value()) {
        // 1. Set name to parser's next numeric name, serialized.
        // 2. Increment parser's next numeric name by 1.
        name = String::number(m_next_numeric_name++);
    }

    // 16. If the result of running is a duplicate name given parser and name is true, then throw a TypeError.
    if (is_a_duplicate_name(name))
        return ErrorInfo { MUST(String::formatted("Duplicate name '{}' provided in URL pattern", name)) };

    // NOTE: Finally, we encode the fixed text values and create the part.
    // 17. Let encoded prefix be the result of running parser's encoding callback given prefix.
    auto prefix_encoded = TRY(m_encoding_callback(prefix));

    // 18. Let encoded suffix be the result of running parser's encoding callback given suffix.
    auto suffix_encoded = TRY(m_encoding_callback(suffix));

    // 19. Let part be a new part whose type is type, value is regexp value, modifier is modifier, name is name,
    //     prefix is encoded prefix, and suffix is encoded suffix.
    // 20. Append part to parser's part list.
    m_part_list.append({ type, move(regexp_value), modifier, move(name), move(prefix_encoded), move(suffix_encoded) });
    return {};
}
// https://urlpattern.spec.whatwg.org/#try-to-consume-a-modifier-token
// Consumes the next token if it is an "other-modifier" or, failing that, an "asterisk".
// Returns an empty Optional when the next token is neither.
Optional<Token const&> PatternParser::try_to_consume_a_modifier_token()
{
    // 1. Let token be the result of running try to consume a token given parser and "other-modifier".
    // 2. If token is not null, then return token.
    if (auto other_modifier = try_to_consume_a_token(Token::Type::OtherModifier); other_modifier.has_value())
        return other_modifier;

    // 3. Set token to the result of running try to consume a token given parser and "asterisk".
    // 4. Return token.
    return try_to_consume_a_token(Token::Type::Asterisk);
}
// https://urlpattern.spec.whatwg.org/#try-to-consume-a-regexp-or-wildcard-token
// Consumes a "regexp" token if one is next. An "asterisk" token is only tried as a fallback when no name token
// was supplied.
Optional<Token const&> PatternParser::try_to_consume_a_regexp_or_wildcard_token(Optional<Token const&> name_token)
{
    // 1. Let token be the result of running try to consume a token given parser and "regexp".
    auto regexp_token = try_to_consume_a_token(Token::Type::Regexp);

    // 2. If name token is null and token is null, then set token to the result of running try to consume a token
    //    given parser and "asterisk".
    // NOTE: Expressed inversely here: with a regexp token or a name token present, the asterisk is not tried.
    if (regexp_token.has_value() || name_token.has_value())
        return regexp_token;

    // 3. Return token.
    return try_to_consume_a_token(Token::Type::Asterisk);
}
// https://urlpattern.spec.whatwg.org/#try-to-consume-a-token
// Returns a reference to the token at the parser's index and advances the index when that token has the
// requested type; otherwise returns an empty Optional and leaves the index unchanged.
Optional<Token const&> PatternParser::try_to_consume_a_token(Token::Type type)
{
    // 1. Assert: parser's index is less than parser's token list size.
    VERIFY(m_index < m_token_list.size());

    // 2. Let next token be parser's token list[parser's index].
    auto const& candidate = m_token_list[m_index];

    if (candidate.type == type) {
        // 4. Increment parser's index by 1.
        ++m_index;

        // 5. Return next token.
        return candidate;
    }

    // 3. If next token's type is not type return null.
    return {};
}
// https://urlpattern.spec.whatwg.org/#parse-a-pattern-string
// Tokenizes `input` with the strict tokenizer policy and converts the resulting token list into a part list.
// `options` supplies the prefix code point and is used to generate the segment wildcard regexp;
// `encoding_callback` is applied to fixed text, prefixes and suffixes before they are stored in a part.
// Returns the part list, or the first error produced by the tokenizer, the encoding callback, a missing
// required token, or a duplicate part name.
PatternErrorOr<Vector<Part>> PatternParser::parse(Utf8View const& input, Options const& options, EncodingCallback encoding_callback)
{
// 1. Let parser be a new pattern parser whose encoding callback is encoding callback and segment wildcard regexp
// is the result of running generate a segment wildcard regexp given options.
PatternParser parser { move(encoding_callback), generate_a_segment_wildcard_regexp(options) };
// 2. Set parser's token list to the result of running tokenize given input and "strict".
parser.m_token_list = TRY(Tokenizer::tokenize(input, Tokenizer::Policy::Strict));
// 3. While parser's index is less than parser's token list's size:
while (parser.m_index < parser.m_token_list.size()) {
// 1. Let char token be the result of running try to consume a token given parser and "char".
auto char_token = parser.try_to_consume_a_token(Token::Type::Char);
// 2. Let name token be the result of running try to consume a token given parser and "name".
auto name_token = parser.try_to_consume_a_token(Token::Type::Name);
// 3. Let regexp or wildcard token be the result of running try to consume a regexp or wildcard token given
// parser and name token.
auto regexp_or_wildcard_token = parser.try_to_consume_a_regexp_or_wildcard_token(name_token);
// 4. If name token is not null or regexp or wildcard token is not null:
// NOTE: If there is a matching group, we need to add the part immediately.
if (name_token.has_value() || regexp_or_wildcard_token.has_value()) {
// 1. Let prefix be the empty string.
String prefix;
// 2. If char token is not null then set prefix to char token's value.
if (char_token.has_value())
prefix = char_token->value;
// 3. If prefix is not the empty string and not options's prefix code point:
// NOTE: An absent prefix code point stands for the empty string, so any non-empty prefix differs from it.
if (!prefix.is_empty() && (!options.prefix_code_point.has_value() || prefix != String::from_code_point(*options.prefix_code_point))) {
// 1. Append prefix to the end of parser's pending fixed value.
parser.m_pending_fixed_value.append(prefix);
// 2. Set prefix to the empty string.
prefix = String {};
}
// 4. Run maybe add a part from the pending fixed value given parser.
TRY(parser.maybe_add_a_part_from_the_pending_fixed_value());
// 5. Let modifier token be the result of running try to consume a modifier token given parser.
auto modifier_token = parser.try_to_consume_a_modifier_token();
// 6. Run add a part given parser, prefix, name token, regexp or wildcard token, the empty string,
// and modifier token.
TRY(parser.add_a_part(prefix, name_token, regexp_or_wildcard_token, String {}, modifier_token));
// 7. Continue.
continue;
}
// 5. Let fixed token be char token.
// NOTE: If there was no matching group, then we need to buffer any fixed text. We want to collect as
// much text as possible before adding it as a "fixed-text" part.
auto fixed_token = char_token;
// 6. If fixed token is null, then set fixed token to the result of running try to consume a token given
// parser and "escaped-char".
if (!fixed_token.has_value())
fixed_token = parser.try_to_consume_a_token(Token::Type::EscapedChar);
// 7. If fixed token is not null:
if (fixed_token.has_value()) {
// 1. Append fixed token's value to parser's pending fixed value.
parser.m_pending_fixed_value.append(fixed_token->value);
// 2. Continue.
continue;
}
// 8. Let open token be the result of running try to consume a token given parser and "open".
auto open_token = parser.try_to_consume_a_token(Token::Type::Open);
// 9. If open token is not null:
if (open_token.has_value()) {
// 1. Let prefix be the result of running consume text given parser.
auto prefix = parser.consume_text();
// 2. Set name token to the result of running try to consume a token given parser and "name".
name_token = parser.try_to_consume_a_token(Token::Type::Name);
// 3. Set regexp or wildcard token to the result of running try to consume a regexp or wildcard token
// given parser and name token.
regexp_or_wildcard_token = parser.try_to_consume_a_regexp_or_wildcard_token(name_token);
// 4. Let suffix be the result of running consume text given parser.
auto suffix = parser.consume_text();
// 5. Run consume a required token given parser and "close".
TRY(parser.consume_a_required_token(Token::Type::Close));
// 6. Let modifier token be the result of running try to consume a modifier token given parser.
auto modifier_token = parser.try_to_consume_a_modifier_token();
// 7. Run add a part given parser, prefix, name token, regexp or wildcard token, suffix, and modifier token.
TRY(parser.add_a_part(prefix, name_token, regexp_or_wildcard_token, suffix, modifier_token));
// 8. Continue.
continue;
}
// 10. Run maybe add a part from the pending fixed value given parser.
TRY(parser.maybe_add_a_part_from_the_pending_fixed_value());
// 11. Run consume a required token given parser and "end".
// NOTE: Reaching this point means the next token was none of the kinds handled above, so anything other
// than "end" is reported as an error.
TRY(parser.consume_a_required_token(Token::Type::End));
}
// Debug-only dump of the resulting part list; compiled in only when URL_PATTERN_DEBUG is enabled.
if constexpr (URL_PATTERN_DEBUG) {
dbgln("Pattern parser produced the part list:");
for (auto const& part : parser.m_part_list) {
dbgln("Type {}, Value '{}', Modifier {}, Name '{}', Prefix '{}', Suffix '{}'",
Part::type_to_string(part.type), part.value, Part::convert_modifier_to_string(part.modifier),
part.name, part.prefix, part.suffix);
}
}
// 4. Return parser's part list.
return move(parser.m_part_list);
}
}