LibURL/Pattern: Implement the constructor string parser

This is missing one small bit of functionality where the
not-yet-implemented component compilation is required.
This commit is contained in:
Shannon Booth 2025-03-05 16:41:14 +13:00 committed by Tim Flynn
parent f05c0509c3
commit e369756e9c
Notes: github-actions[bot] 2025-03-15 11:40:11 +00:00
5 changed files with 986 additions and 0 deletions

View file

@ -8,6 +8,7 @@ set(SOURCES
URL.cpp
${PUBLIC_SUFFIX_SOURCES}
Pattern/Canonicalization.cpp
Pattern/ConstructorStringParser.cpp
Pattern/Pattern.cpp
Pattern/Tokenizer.cpp
)

View file

@ -0,0 +1,704 @@
/*
* Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Debug.h>
#include <AK/GenericShorthands.h>
#include <LibURL/Pattern/Canonicalization.h>
#include <LibURL/Pattern/Component.h>
#include <LibURL/Pattern/ConstructorStringParser.h>
namespace URL::Pattern {
// Returns a human-readable name for the parser's current state.
// Within this file it is only consumed by the debug trace in parse().
StringView ConstructorStringParser::state_to_string() const
{
    StringView name;

    switch (m_state) {
    case State::Initial:
        name = "Initial"sv;
        break;
    case State::Protocol:
        name = "Protocol"sv;
        break;
    case State::Authority:
        name = "Authority"sv;
        break;
    case State::Username:
        name = "Username"sv;
        break;
    case State::Password:
        name = "Password"sv;
        break;
    case State::Hostname:
        name = "Hostname"sv;
        break;
    case State::Port:
        name = "Port"sv;
        break;
    case State::Pathname:
        name = "Pathname"sv;
        break;
    case State::Search:
        name = "Search"sv;
        break;
    case State::Hash:
        name = "Hash"sv;
        break;
    case State::Done:
        name = "Done"sv;
        break;
    }

    // Every enumerator assigns a name above, so a null view means m_state holds a value outside of State.
    if (name.is_null())
        VERIFY_NOT_REACHED();

    return name;
}
// Creates a parser over `input` and its already-tokenized form.
// The view is copied into m_input and the token list is moved into m_token_list; every other member keeps its
// in-class default from the header (state "Initial", all indices and depths zero, token increment 1).
// NOTE(review): m_input is stored as a view — presumably the caller must keep the underlying string alive for as
//               long as the parser is used; confirm against Utf8View's ownership semantics.
ConstructorStringParser::ConstructorStringParser(Utf8View const& input, Vector<Token> token_list)
: m_input(input)
, m_token_list(move(token_list))
{
}
// https://urlpattern.spec.whatwg.org/#parse-a-constructor-string
//
// Tokenizes `input` with the lenient policy and walks the tokens with a small state machine, slicing the constructor
// string into per-component pattern strings (protocol, username, password, hostname, port, pathname, search, hash).
//
// Returns the populated Init on success. Errors from the tokenizer and from computing the "protocol matches a
// special scheme" flag are propagated to the caller.
//
// Unbalanced IPv6 brackets in the hostname (e.g. "https://]") are accepted rather than asserted on: the spec only
// says "decrement", so such input must not abort the process.
PatternErrorOr<Init> ConstructorStringParser::parse(Utf8View const& input)
{
    // 1. Let parser be a new constructor string parser whose input is input and token list is the result of running
    //    tokenize given input and "lenient".
    ConstructorStringParser parser { input, TRY(Tokenizer::tokenize(input, Tokenizer::Policy::Lenient)) };

    // 2. While parser's token index is less than parser's token list size:
    while (parser.m_token_index < parser.m_token_list.size()) {
        dbgln_if(URL_PATTERN_DEBUG, "{}\t| Token@{} (group depth {}) -> {}", parser.state_to_string(),
            parser.m_token_index, parser.m_group_depth, parser.m_token_list[parser.m_token_index].to_string());

        // 1. Set parser's token increment to 1.
        parser.m_token_increment = 1;

        // NOTE: On every iteration of the parse loop the parser's token index will be incremented by its token
        //       increment value. Typically this means incrementing by 1, but at certain times it is set to zero.
        //       The token increment is then always reset back to 1 at the top of the loop.

        // 2. If parser's token list[parser's token index]'s type is "end" then:
        if (parser.m_token_list[parser.m_token_index].type == Token::Type::End) {
            // 1. If parser's state is "init":
            if (parser.m_state == State::Initial) {
                // NOTE: If we reached the end of the string in the "init" state, then we failed to find a protocol
                //       terminator and this has to be a relative URLPattern constructor string.

                // 1. Run rewind given parser.
                parser.rewind();

                // NOTE: We next determine at which component the relative pattern begins. Relative pathnames are
                //       most common, but URLs and URLPattern constructor strings can begin with the search or hash
                //       components as well.

                // 2. If the result of running is a hash prefix given parser is true, then run change state given parser,
                //    "hash" and 1.
                if (parser.is_a_hash_prefix()) {
                    parser.change_state(State::Hash, 1);
                }
                // 3. Otherwise if the result of running is a search prefix given parser is true:
                else if (parser.is_a_search_prefix()) {
                    // 1. Run change state given parser, "search" and 1.
                    parser.change_state(State::Search, 1);
                }
                // 4. Otherwise:
                else {
                    // 1. Run change state given parser, "pathname" and 0.
                    parser.change_state(State::Pathname, 0);
                }

                // 5. Increment parser's token index by parser's token increment.
                parser.m_token_index += parser.m_token_increment;

                // 6. Continue.
                continue;
            }

            // 2. If parser's state is "authority":
            if (parser.m_state == State::Authority) {
                // NOTE: If we reached the end of the string in the "authority" state, then we failed to find an
                //       "@". Therefore there is no username or password.

                // 1. Run rewind and set state given parser, and "hostname".
                parser.rewind_and_set_state(State::Hostname);

                // 2. Increment parser's token index by parser's token increment.
                parser.m_token_index += parser.m_token_increment;

                // 3. Continue.
                continue;
            }

            // 3. Run change state given parser, "done" and 0.
            parser.change_state(State::Done, 0);

            // 4. Break.
            break;
        }

        // 3. If the result of running is a group open given parser is true:
        if (parser.is_a_group_open()) {
            // NOTE: We ignore all code points within "{ ... }" pattern groupings. It would not make sense to allow
            //       a URL component boundary to lie within a grouping; e.g. "https://example.c{om/fo}o". While not
            //       supported within well formed pattern strings, we handle nested groupings here to avoid parser
            //       confusion.
            //
            //       It is not necessary to perform this logic for regexp or named groups since those values are
            //       collapsed into individual tokens by the tokenize algorithm.

            // 1. Increment parser's group depth by 1.
            ++parser.m_group_depth;

            // 2. Increment parser's token index by parser's token increment.
            parser.m_token_index += parser.m_token_increment;

            // 3. Continue.
            continue;
        }

        // 4. If parser's group depth is greater than 0:
        if (parser.m_group_depth > 0) {
            // 1. If the result of running is a group close given parser is true, then decrement parser's group depth by 1.
            if (parser.is_a_group_close()) {
                VERIFY(parser.m_group_depth != 0);
                --parser.m_group_depth;
            }
            // 2. Otherwise:
            else {
                // 1. Increment parser's token index by parser's token increment.
                parser.m_token_index += parser.m_token_increment;

                // 2. Continue.
                continue;
            }
        }

        // 5. Switch on parser's state and run the associated steps:
        switch (parser.m_state) {
        // -> "init", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-init%E2%91%A2
        case State::Initial: {
            // 1. If the result of running is a protocol suffix given parser is true:
            if (parser.is_a_protocol_suffix()) {
                // 1. Run rewind and set state given parser and "protocol".
                parser.rewind_and_set_state(State::Protocol);
            }
            break;
        }
        // -> "protocol", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-protocol%E2%91%A0
        case State::Protocol: {
            // 1. If the result of running is a protocol suffix given parser is true:
            if (parser.is_a_protocol_suffix()) {
                // 1. Run compute protocol matches a special scheme flag given parser.
                TRY(parser.compute_protocol_matches_a_special_scheme_flag());

                // NOTE: We need to eagerly compile the protocol component to determine if it matches any special
                //       schemes. If it does then certain special rules apply. It determines if the pathname
                //       defaults to a "/" and also whether we will look for the username, password, hostname, and
                //       port components. Authority slashes can also cause us to look for these components as well.
                //       Otherwise we treat this as an "opaque path URL" and go straight to the pathname component.

                // 2. Let next state be "pathname".
                auto next_state = State::Pathname;

                // 3. Let skip be 1.
                u32 skip = 1;

                // 4. If the result of running next is authority slashes given parser is true:
                if (parser.next_is_authority_slashes()) {
                    // 1. Set next state to "authority".
                    next_state = State::Authority;

                    // 2. Set skip to 3.
                    skip = 3;
                }
                // 5. Otherwise if parser's protocol matches a special scheme flag is true, then set next state to "authority".
                else if (parser.m_protocol_matches_a_special_scheme) {
                    next_state = State::Authority;
                }

                // 6. Run change state given parser, next state, and skip.
                parser.change_state(next_state, skip);
            }
            break;
        }
        // -> "authority", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-authority%E2%91%A3
        case State::Authority: {
            // 1. If the result of running is an identity terminator given parser is true, then run rewind and set state
            //    given parser and "username".
            if (parser.is_an_identity_terminator()) {
                parser.rewind_and_set_state(State::Username);
            }
            // 2. Otherwise if any of the following are true:
            //     * the result of running is a pathname start given parser;
            //     * the result of running is a search prefix given parser; or
            //     * the result of running is a hash prefix given parser,
            //    then run rewind and set state given parser and "hostname".
            else if (parser.is_a_pathname_start()
                || parser.is_a_search_prefix()
                || parser.is_a_hash_prefix()) {
                parser.rewind_and_set_state(State::Hostname);
            }
            break;
        }
        // -> "username", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-username%E2%91%A0
        case State::Username: {
            // 1. If the result of running is a password prefix given parser is true, then run change state given
            //    parser, "password", and 1.
            if (parser.is_a_password_prefix()) {
                parser.change_state(State::Password, 1);
            }
            // 2. Otherwise if the result of running is an identity terminator given parser is true, then run change
            //    state given parser, "hostname", and 1.
            else if (parser.is_an_identity_terminator()) {
                parser.change_state(State::Hostname, 1);
            }
            break;
        }
        // -> "password", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-password%E2%91%A0
        case State::Password: {
            // 1. If the result of running is an identity terminator given parser is true, then run change state
            //    given parser, "hostname", and 1.
            if (parser.is_an_identity_terminator())
                parser.change_state(State::Hostname, 1);
            break;
        }
        // -> "hostname", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-hostname%E2%91%A3
        case State::Hostname: {
            // 1. If the result of running is an IPv6 open given parser is true, then increment parser's hostname
            //    IPv6 bracket depth by 1.
            if (parser.is_an_ipv6_open()) {
                ++parser.m_hostname_ipv6_bracket_depth;
            }
            // 2. Otherwise if the result of running is an IPv6 close given parser is true, then decrement parser's
            //    hostname IPv6 bracket depth by 1.
            else if (parser.is_an_ipv6_close()) {
                // NOTE: The spec puts no lower bound on this counter, so an unbalanced "]" (e.g. "https://]") is
                //       ordinary input and must not abort the process. The counter is unsigned, so decrementing
                //       past zero wraps around instead of going negative; that is well-defined, still compares
                //       unequal to zero in step 3 (just as the spec's -1 would), and a later "[" undoes it.
                --parser.m_hostname_ipv6_bracket_depth;
            }
            // 3. Otherwise if the result of running is a port prefix given parser is true and parser's hostname IPv6
            //    bracket depth is zero, then run change state given parser, "port", and 1.
            else if (parser.is_a_port_prefix() && parser.m_hostname_ipv6_bracket_depth == 0) {
                parser.change_state(State::Port, 1);
            }
            // 4. Otherwise if the result of running is a pathname start given parser is true, then run change state
            //    given parser, "pathname", and 0.
            else if (parser.is_a_pathname_start()) {
                parser.change_state(State::Pathname, 0);
            }
            // 5. Otherwise if the result of running is a search prefix given parser is true, then run change state
            //    given parser, "search", and 1.
            else if (parser.is_a_search_prefix()) {
                parser.change_state(State::Search, 1);
            }
            // 6. Otherwise if the result of running is a hash prefix given parser is true, then run change state
            //    given parser, "hash", and 1.
            else if (parser.is_a_hash_prefix()) {
                parser.change_state(State::Hash, 1);
            }
            break;
        }
        // -> "port", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-port%E2%91%A0
        case State::Port: {
            // 1. If the result of running is a pathname start given parser is true, then run change state given
            //    parser, "pathname", and 0.
            if (parser.is_a_pathname_start()) {
                parser.change_state(State::Pathname, 0);
            }
            // 2. Otherwise if the result of running is a search prefix given parser is true, then run change state
            //    given parser, "search", and 1.
            else if (parser.is_a_search_prefix()) {
                parser.change_state(State::Search, 1);
            }
            // 3. Otherwise if the result of running is a hash prefix given parser is true, then run change state given
            //    parser, "hash", and 1.
            else if (parser.is_a_hash_prefix()) {
                parser.change_state(State::Hash, 1);
            }
            break;
        }
        // -> "pathname", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-pathname%E2%91%A3
        case State::Pathname: {
            // 1. If the result of running is a search prefix given parser is true, then run change state given parser,
            //    "search", and 1.
            if (parser.is_a_search_prefix()) {
                parser.change_state(State::Search, 1);
            }
            // 2. Otherwise if the result of running is a hash prefix given parser is true, then run change state given
            //    parser, "hash", and 1.
            else if (parser.is_a_hash_prefix()) {
                parser.change_state(State::Hash, 1);
            }
            break;
        }
        // -> "search", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-search%E2%91%A3
        case State::Search: {
            // 1. If the result of running is a hash prefix given parser is true, then run change state given parser,
            //    "hash", and 1.
            if (parser.is_a_hash_prefix())
                parser.change_state(State::Hash, 1);
            break;
        }
        // -> "hash", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-hash%E2%91%A4
        case State::Hash: {
            // 1. Do nothing.
            break;
        }
        // -> "done", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-done%E2%91%A0
        case State::Done: {
            // 1. Assert: This step is never reached.
            VERIFY_NOT_REACHED();
        }
        }

        // 6. Increment parser's token index by parser's token increment.
        parser.m_token_index += parser.m_token_increment;
    }

    // 3. If parser's result contains "hostname" and not "port", then set parser's result["port"] to the empty string.
    if (parser.m_result.hostname.has_value() && !parser.m_result.port.has_value())
        parser.m_result.port = String {};

    // NOTE: This is special-cased because when an author does not specify a port, they usually intend the default
    //       port. If any port is acceptable, the author can specify it as a wildcard explicitly. For example,
    //       "https://example.com/*" does not match URLs beginning with "https://example.com:8443/", which is a
    //       different origin.

    // 4. Return parser's result.
    return parser.m_result;
}
// https://urlpattern.spec.whatwg.org/#make-a-component-string
//
// Returns the slice of the input that spans from the token at the component start up to (but not including) the
// current token, as a freshly allocated String.
String ConstructorStringParser::make_a_component_string() const
{
    // The current token must exist: its input index marks where the component ends.
    VERIFY(m_token_index < m_token_list.size());

    // Input index where the component begins; the safe lookup falls back to the trailing "end" token when the
    // component start lies past the last token.
    auto const start = get_a_safe_token(m_component_start).index;

    // Input index of the current token, i.e. one past the component's last code point.
    auto const end = m_token_list[m_token_index].index;

    // Code point substring [start, end) of the parser's input.
    return MUST(String::from_utf8(m_input.unicode_substring_view(start, end - start).as_string()));
}
// https://urlpattern.spec.whatwg.org/#compute-protocol-matches-a-special-scheme-flag
//
// Placeholder: does nothing and always reports success. As a result m_protocol_matches_a_special_scheme is never
// assigned by this function and keeps its in-class default of false, so the branches in parse() and change_state()
// that test the flag (authority without "//", pathname defaulting to "/") currently always take the "not special" path.
PatternErrorOr<void> ConstructorStringParser::compute_protocol_matches_a_special_scheme_flag()
{
// FIXME: Implement this.
return {};
}
// Returns the result field that corresponds to the parser's current state.
// Must only be called in a state that maps to a component; "Initial", "Authority" and "Done" have no field and
// trigger VERIFY_NOT_REACHED().
Optional<String> const& ConstructorStringParser::result_for_active_state() const
{
    Optional<String> const* slot = nullptr;

    switch (m_state) {
    case State::Protocol:
        slot = &m_result.protocol;
        break;
    case State::Username:
        slot = &m_result.username;
        break;
    case State::Password:
        slot = &m_result.password;
        break;
    case State::Hostname:
        slot = &m_result.hostname;
        break;
    case State::Port:
        slot = &m_result.port;
        break;
    case State::Pathname:
        slot = &m_result.pathname;
        break;
    case State::Search:
        slot = &m_result.search;
        break;
    case State::Hash:
        slot = &m_result.hash;
        break;
    case State::Initial:
    case State::Authority:
    case State::Done:
        // No component is associated with these states.
        break;
    }

    if (!slot)
        VERIFY_NOT_REACHED();

    return *slot;
}
// Stores `value` into the result field that corresponds to the parser's current state.
// Must only be called in a state that maps to a component; "Initial", "Authority" and "Done" have no field and
// trigger VERIFY_NOT_REACHED() without modifying the result.
void ConstructorStringParser::set_result_for_active_state(Optional<String> value)
{
    Optional<String>* slot = nullptr;

    switch (m_state) {
    case State::Protocol:
        slot = &m_result.protocol;
        break;
    case State::Username:
        slot = &m_result.username;
        break;
    case State::Password:
        slot = &m_result.password;
        break;
    case State::Hostname:
        slot = &m_result.hostname;
        break;
    case State::Port:
        slot = &m_result.port;
        break;
    case State::Pathname:
        slot = &m_result.pathname;
        break;
    case State::Search:
        slot = &m_result.search;
        break;
    case State::Hash:
        slot = &m_result.hash;
        break;
    case State::Initial:
    case State::Authority:
    case State::Done:
        // No component is associated with these states.
        break;
    }

    if (!slot)
        VERIFY_NOT_REACHED();

    *slot = move(value);
}
// https://urlpattern.spec.whatwg.org/#change-state
//
// Finishes the component belonging to the current state (if any), fills in empty defaults for components that the
// transition jumps over, then switches to `new_state`, advancing the token index by `skip` and starting the next
// component there. The token increment is zeroed so that the parse loop does not advance a second time.
void ConstructorStringParser::change_state(State new_state, u32 skip)
{
    // 1. If parser's state is not "init", not "authority", and not "done", then set parser's result[parser's state]
    //    to the result of running make a component string given parser.
    bool const state_has_component = !first_is_one_of(m_state, State::Initial, State::Authority, State::Done);
    if (state_has_component)
        set_result_for_active_state(make_a_component_string());

    // 2. If parser's state is not "init" and new state is not "done", then:
    if (m_state != State::Initial && new_state != State::Done) {
        // The three sub-steps share a growing set of "source" states; name each set once.
        bool const leaving_before_hostname = first_is_one_of(m_state, State::Protocol, State::Authority, State::Username, State::Password);
        bool const leaving_before_pathname = leaving_before_hostname || first_is_one_of(m_state, State::Hostname, State::Port);
        bool const leaving_before_search = leaving_before_pathname || m_state == State::Pathname;

        // 1. If parser's state is "protocol", "authority", "username", or "password"; new state is "port",
        //    "pathname", "search", or "hash"; and parser's result["hostname"] does not exist, then set parser's
        //    result["hostname"] to the empty string.
        if (leaving_before_hostname
            && first_is_one_of(new_state, State::Port, State::Pathname, State::Search, State::Hash)
            && !m_result.hostname.has_value()) {
            m_result.hostname = String {};
        }

        // 2. If parser's state is "protocol", "authority", "username", "password", "hostname", or "port"; new state
        //    is "search" or "hash"; and parser's result["pathname"] does not exist, then:
        //     1. If parser's protocol matches a special scheme flag is true, then set parser's result["pathname"] to "/".
        //     2. Otherwise, set parser's result["pathname"] to the empty string.
        if (leaving_before_pathname
            && first_is_one_of(new_state, State::Search, State::Hash)
            && !m_result.pathname.has_value()) {
            m_result.pathname = m_protocol_matches_a_special_scheme ? "/"_string : String {};
        }

        // 3. If parser's state is "protocol", "authority", "username", "password", "hostname", "port", or
        //    "pathname"; new state is "hash"; and parser's result["search"] does not exist, then set parser's
        //    result["search"] to the empty string.
        if (leaving_before_search
            && new_state == State::Hash
            && !m_result.search.has_value()) {
            m_result.search = String {};
        }
    }

    // 3. Set parser's state to new state.
    m_state = new_state;

    // 4. Increment parser's token index by skip.
    m_token_index += skip;

    // 5. Set parser's component start to parser's token index.
    m_component_start = m_token_index;

    // 6. Set parser's token increment to 0.
    m_token_increment = 0;
}
// https://urlpattern.spec.whatwg.org/#next-is-authority-slashes
//
// True when the two tokens following the current one are both literal "/" characters (the "//" after a protocol's ":").
bool ConstructorStringParser::next_is_authority_slashes() const
{
    // 1-3. Both lookahead tokens must be a non-special "/"; the second is only inspected when the first matches.
    return is_a_non_special_pattern_char(m_token_index + 1, '/')
        && is_a_non_special_pattern_char(m_token_index + 2, '/');
}
// https://urlpattern.spec.whatwg.org/#is-an-identity-terminator
//
// True when the current token is a literal "@", which ends the username/password part of the authority.
bool ConstructorStringParser::is_an_identity_terminator() const
{
    constexpr char identity_terminator = '@';
    return is_a_non_special_pattern_char(m_token_index, identity_terminator);
}
// https://urlpattern.spec.whatwg.org/#is-a-password-prefix
//
// True when the current token is a literal ":", which separates the username from the password.
bool ConstructorStringParser::is_a_password_prefix() const
{
    constexpr char password_prefix = ':';
    return is_a_non_special_pattern_char(m_token_index, password_prefix);
}
// https://urlpattern.spec.whatwg.org/#is-a-port-prefix
//
// True when the current token is a literal ":", which separates the hostname from the port.
bool ConstructorStringParser::is_a_port_prefix() const
{
    constexpr char port_prefix = ':';
    return is_a_non_special_pattern_char(m_token_index, port_prefix);
}
// https://urlpattern.spec.whatwg.org/#is-a-pathname-start
//
// True when the current token is a literal "/", which begins the pathname component.
bool ConstructorStringParser::is_a_pathname_start() const
{
    constexpr char pathname_start = '/';
    return is_a_non_special_pattern_char(m_token_index, pathname_start);
}
// https://urlpattern.spec.whatwg.org/#is-a-search-prefix
//
// Decides whether the current "?" starts the search component. A "?" that directly follows a name, regexp, group
// close or asterisk token is not treated as a search prefix.
bool ConstructorStringParser::is_a_search_prefix() const
{
    // 1. If result of running is a non-special pattern char given parser, parser's token index and "?" is true,
    //    then return true.
    if (is_a_non_special_pattern_char(m_token_index, '?'))
        return true;

    // 2. If parser's token list[parser's token index]'s value is not "?", then return false.
    if (m_token_list[m_token_index].value != "?"sv)
        return false;

    // 3. Let previous index be parser's token index - 1.
    // 4. If previous index is less than 0, then return true.
    // NOTE: The index is unsigned, so "less than 0" is detected before subtracting.
    if (m_token_index == 0)
        return true;

    // 5. Let previous token be the result of running get a safe token given parser and previous index.
    // 6. If previous token's type is "name", "regexp", "close" or "asterisk", then return false.
    // 7. Return true.
    switch (get_a_safe_token(m_token_index - 1).type) {
    case Token::Type::Name:
    case Token::Type::Regexp:
    case Token::Type::Close:
    case Token::Type::Asterisk:
        return false;
    default:
        return true;
    }
}
// https://urlpattern.spec.whatwg.org/#is-a-protocol-suffix
//
// True when the current token is a literal ":", which ends the protocol component.
bool ConstructorStringParser::is_a_protocol_suffix() const
{
    constexpr char protocol_suffix = ':';
    return is_a_non_special_pattern_char(m_token_index, protocol_suffix);
}
// https://urlpattern.spec.whatwg.org/#is-a-hash-prefix
//
// True when the current token is a literal "#", which begins the hash component.
bool ConstructorStringParser::is_a_hash_prefix() const
{
    constexpr char hash_prefix = '#';
    return is_a_non_special_pattern_char(m_token_index, hash_prefix);
}
// https://urlpattern.spec.whatwg.org/#is-a-group-open
//
// True when the current token has type "open".
bool ConstructorStringParser::is_a_group_open() const
{
    // 1-2. Return whether parser's token list[parser's token index]'s type is "open".
    return m_token_list[m_token_index].type == Token::Type::Open;
}
// https://urlpattern.spec.whatwg.org/#is-a-group-close
//
// True when the current token has type "close".
bool ConstructorStringParser::is_a_group_close() const
{
    // 1-2. Return whether parser's token list[parser's token index]'s type is "close".
    return m_token_list[m_token_index].type == Token::Type::Close;
}
// https://urlpattern.spec.whatwg.org/#is-an-ipv6-open
//
// True when the current token is a literal "[", which opens an IPv6 address in the hostname.
bool ConstructorStringParser::is_an_ipv6_open() const
{
    constexpr char ipv6_open = '[';
    return is_a_non_special_pattern_char(m_token_index, ipv6_open);
}
// https://urlpattern.spec.whatwg.org/#is-an-ipv6-close
//
// True when the current token is a literal "]", which closes an IPv6 address in the hostname.
bool ConstructorStringParser::is_an_ipv6_close() const
{
    constexpr char ipv6_close = ']';
    return is_a_non_special_pattern_char(m_token_index, ipv6_close);
}
// https://urlpattern.spec.whatwg.org/#get-a-safe-token
//
// Bounds-tolerant token lookup: any index at or past the end of the token list yields the final token, which is
// asserted to be the "end" token.
Token const& ConstructorStringParser::get_a_safe_token(u32 index) const
{
    if (index >= m_token_list.size()) {
        // 2. Assert: parser's token list's size is greater than or equal to 1.
        VERIFY(!m_token_list.is_empty());

        // 3. Let last index be parser's token list's size - 1.
        // 4. Let token be parser's token list[last index].
        auto const& end_token = m_token_list.last();

        // 5. Assert: token's type is "end".
        VERIFY(end_token.type == Token::Type::End);

        // 6. Return token.
        return end_token;
    }

    // 1. If index is less than parser's token list's size, then return parser's token list[index].
    return m_token_list[index];
}
// https://urlpattern.spec.whatwg.org/#is-a-non-special-pattern-char
//
// Returns true when the token at `index` (looked up safely, so out-of-range indices resolve to the "end" token) is a
// literal character token whose value is exactly the single character `value`.
bool ConstructorStringParser::is_a_non_special_pattern_char(u32 index, char value) const
{
    // 1. Let token be the result of running get a safe token given parser and index.
    auto const& token = get_a_safe_token(index);

    // 2. If token's value is not value, then return false.
    // NOTE: The whole value is compared, as the spec requires. Comparing only the first byte would also accept a
    //       longer value that merely starts with `value`. An empty value (e.g. the "end" token) never matches.
    if (token.value != StringView { &value, 1 })
        return false;

    // 3. If any of the following are true:
    //     * token's type is "char";
    //     * token's type is "escaped-char"; or
    //     * token's type is "invalid-char",
    //    then return true.
    if (token.type == Token::Type::Char
        || token.type == Token::Type::EscapedChar
        || token.type == Token::Type::InvalidChar) {
        return true;
    }

    // 4. Return false.
    return false;
}
// https://urlpattern.spec.whatwg.org/#rewind
//
// Moves the parser back to the first token of the current component so it can be re-examined, and zeroes the token
// increment so the parse loop's end-of-iteration advance leaves the index where it was put.
void ConstructorStringParser::rewind()
{
// 1. Set parser's token index to parser's component start.
m_token_index = m_component_start;
// 2. Set parser's token increment to 0.
m_token_increment = 0;
}
// https://urlpattern.spec.whatwg.org/#rewind-and-set-state
//
// Rewinds to the start of the current component and switches to `state` directly. Unlike change_state(), this
// records no component string and does not move the component start.
void ConstructorStringParser::rewind_and_set_state(State state)
{
// 1. Run rewind given parser.
rewind();
// 2. Set parser's state to state.
m_state = state;
}
}

View file

@ -0,0 +1,106 @@
/*
* Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <LibURL/Pattern/Init.h>
#include <LibURL/Pattern/PatternError.h>
#include <LibURL/Pattern/Tokenizer.h>
namespace URL::Pattern {
// https://urlpattern.spec.whatwg.org/#constructor-string-parser
//
// Splits a URLPattern constructor string into the per-component pattern strings of an Init.
// The static parse() is the only public entry point; instances exist only inside it.
class ConstructorStringParser {
public:
    // Tokenizes and parses `input`, returning the populated Init or a pattern error.
    static PatternErrorOr<Init> parse(Utf8View const& input);

private:
    // https://urlpattern.spec.whatwg.org/#constructor-string-parser-state
    enum class State {
        Initial,
        Protocol,
        Authority,
        Username,
        Password,
        Hostname,
        Port,
        Pathname,
        Search,
        Hash,
        Done,
    };

    ConstructorStringParser(Utf8View const& input, Vector<Token> token_list);

    // Name of the current state, for debug output.
    StringView state_to_string() const;

    // -- State transitions --
    void change_state(State, u32 skip);
    void rewind();
    void rewind_and_set_state(State);

    // -- Component extraction and result access --
    String make_a_component_string() const;
    PatternErrorOr<void> compute_protocol_matches_a_special_scheme_flag();
    Optional<String> const& result_for_active_state() const;
    void set_result_for_active_state(Optional<String> value);

    // -- Token lookup --
    Token const& get_a_safe_token(u32 index) const;
    bool is_a_non_special_pattern_char(u32 index, char value) const;

    // -- Predicates on the current token (and, for the first one, the two tokens after it) --
    bool next_is_authority_slashes() const;
    bool is_a_protocol_suffix() const;
    bool is_an_identity_terminator() const;
    bool is_a_password_prefix() const;
    bool is_a_port_prefix() const;
    bool is_a_pathname_start() const;
    bool is_a_search_prefix() const;
    bool is_a_hash_prefix() const;
    bool is_a_group_open() const;
    bool is_a_group_close() const;
    bool is_an_ipv6_open() const;
    bool is_an_ipv6_close() const;

    // https://urlpattern.spec.whatwg.org/#constructor-string-parser-input
    // The string being parsed; supplied at construction.
    Utf8View m_input;

    // https://urlpattern.spec.whatwg.org/#constructor-string-parser-token-list
    // The tokens produced from the input; supplied at construction.
    Vector<Token> m_token_list;

    // https://urlpattern.spec.whatwg.org/#constructor-string-parser-result
    // The URLPatternInit being filled in; starts out as a new, empty one.
    Init m_result;

    // https://urlpattern.spec.whatwg.org/#constructor-string-parser-component-start
    // Token index at which the component currently being parsed begins; starts at 0.
    u32 m_component_start { 0 };

    // https://urlpattern.spec.whatwg.org/#constructor-string-parser-token-index
    // Index of the token currently being examined; starts at 0.
    u32 m_token_index { 0 };

    // https://urlpattern.spec.whatwg.org/#constructor-string-parser-token-increment
    // Amount the token index advances by at the end of a parse-loop iteration; starts at 1.
    u32 m_token_increment { 1 };

    // https://urlpattern.spec.whatwg.org/#constructor-string-parser-group-depth
    // Nesting depth of "{ ... }" groupings around the current token; starts at 0.
    u32 m_group_depth { 0 };

    // https://urlpattern.spec.whatwg.org/#constructor-string-parser-hostname-ipv6-bracket-depth
    // Nesting depth of "[ ... ]" brackets within the hostname; starts at 0.
    u32 m_hostname_ipv6_bracket_depth { 0 };

    // https://urlpattern.spec.whatwg.org/#constructor-string-parser-protocol-matches-a-special-scheme-flag
    // Whether the parsed protocol matches a special scheme; starts out false.
    bool m_protocol_matches_a_special_scheme { false };

    // https://urlpattern.spec.whatwg.org/#constructor-string-parser-state
    // The state machine's current state; starts out as "init".
    State m_state { State::Initial };
};
}

View file

@ -1,5 +1,6 @@
set(URL_TEST_SOURCES
TestURL.cpp
TestURLPatternConstructorStringParser.cpp
)
foreach(source IN LISTS URL_TEST_SOURCES)

View file

@ -0,0 +1,174 @@
/*
* Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibTest/TestCase.h>
#include <LibURL/Pattern/ConstructorStringParser.h>
// Scheme and host only: pathname, search and hash stay unset, and the port defaults to "" because a hostname is present.
TEST_CASE(basic_http_url_no_pattern_or_path)
{
    auto constructor_string = "http://www.serenityos.org"_string;
    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(constructor_string.code_points()));

    EXPECT_EQ(init.protocol, "http"sv);
    EXPECT_EQ(init.username, OptionalNone {});
    EXPECT_EQ(init.password, OptionalNone {});
    EXPECT_EQ(init.hostname, "www.serenityos.org"sv);
    EXPECT_EQ(init.port, ""sv);
    EXPECT_EQ(init.pathname, OptionalNone {});
    EXPECT_EQ(init.search, OptionalNone {});
    EXPECT_EQ(init.hash, OptionalNone {});
    EXPECT_EQ(init.base_url, OptionalNone {});
}
// A relative constructor string with no protocol: everything, including the regexp group, ends up in pathname.
TEST_CASE(pathname_with_regexp)
{
    auto constructor_string = "/books/(\\d+)"_string;
    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(constructor_string.code_points()));

    EXPECT_EQ(init.protocol, OptionalNone {});
    EXPECT_EQ(init.username, OptionalNone {});
    EXPECT_EQ(init.password, OptionalNone {});
    EXPECT_EQ(init.hostname, OptionalNone {});
    EXPECT_EQ(init.port, OptionalNone {});
    EXPECT_EQ(init.pathname, "/books/(\\d+)"sv);
    EXPECT_EQ(init.search, OptionalNone {});
    EXPECT_EQ(init.hash, OptionalNone {});
    EXPECT_EQ(init.base_url, OptionalNone {});
}
// Absolute URL whose pathname ends in a "*" wildcard; the wildcard is kept verbatim in pathname.
TEST_CASE(url_with_pathname_and_regexp)
{
    auto constructor_string = "https://example.com/2022/feb/*"_string;
    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(constructor_string.code_points()));

    EXPECT_EQ(init.protocol, "https"sv);
    EXPECT_EQ(init.username, OptionalNone {});
    EXPECT_EQ(init.password, OptionalNone {});
    EXPECT_EQ(init.hostname, "example.com"sv);
    EXPECT_EQ(init.port, ""sv);
    EXPECT_EQ(init.pathname, "/2022/feb/*"sv);
    EXPECT_EQ(init.search, OptionalNone {});
    EXPECT_EQ(init.hash, OptionalNone {});
    EXPECT_EQ(init.base_url, OptionalNone {});
}
// "*" wildcards in both hostname and pathname; each stays within its own component.
TEST_CASE(http_url_regexp_in_pathname_and_hostname)
{
    auto constructor_string = "https://cdn-*.example.com/*.jpg"_string;
    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(constructor_string.code_points()));

    EXPECT_EQ(init.protocol, "https"sv);
    EXPECT_EQ(init.username, OptionalNone {});
    EXPECT_EQ(init.password, OptionalNone {});
    EXPECT_EQ(init.hostname, "cdn-*.example.com"sv);
    EXPECT_EQ(init.port, ""sv);
    EXPECT_EQ(init.pathname, "/*.jpg"sv);
    EXPECT_EQ(init.search, OptionalNone {});
    EXPECT_EQ(init.hash, OptionalNone {});
    EXPECT_EQ(init.base_url, OptionalNone {});
}
// Jumping from pathname straight to hash fills the skipped search component with the empty string.
TEST_CASE(https_url_with_fragment)
{
    auto constructor_string = "https://example.com/#foo"_string;
    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(constructor_string.code_points()));

    EXPECT_EQ(init.protocol, "https"sv);
    EXPECT_EQ(init.username, OptionalNone {});
    EXPECT_EQ(init.password, OptionalNone {});
    EXPECT_EQ(init.hostname, "example.com"sv);
    EXPECT_EQ(init.port, ""sv);
    EXPECT_EQ(init.pathname, "/"sv);
    EXPECT_EQ(init.search, ""sv);
    EXPECT_EQ(init.hash, "foo"sv);
    EXPECT_EQ(init.base_url, OptionalNone {});
}
// Once inside the search component, a further "?" and grouping characters are kept verbatim as part of it.
TEST_CASE(http_url_with_query)
{
    auto constructor_string = "https://example.com/?q=*&v=?&hmm={}&umm=()"_string;
    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(constructor_string.code_points()));

    EXPECT_EQ(init.protocol, "https"sv);
    EXPECT_EQ(init.username, OptionalNone {});
    EXPECT_EQ(init.password, OptionalNone {});
    EXPECT_EQ(init.hostname, "example.com"sv);
    EXPECT_EQ(init.port, ""sv);
    EXPECT_EQ(init.pathname, "/"sv);
    EXPECT_EQ(init.search, "q=*&v=?&hmm={}&umm=()"sv);
    EXPECT_EQ(init.hash, OptionalNone {});
    EXPECT_EQ(init.base_url, OptionalNone {});
}
// A "?" directly after a closing "}" is a group modifier, not a search prefix, so it stays in the hostname.
TEST_CASE(matches_on_sub_url)
{
    auto constructor_string = "https://{sub.}?example.com/foo"_string;
    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(constructor_string.code_points()));

    EXPECT_EQ(init.protocol, "https"sv);
    EXPECT_EQ(init.username, OptionalNone {});
    EXPECT_EQ(init.password, OptionalNone {});
    EXPECT_EQ(init.hostname, "{sub.}?example.com"sv);
    EXPECT_EQ(init.port, ""sv);
    EXPECT_EQ(init.pathname, "/foo"sv);
    EXPECT_EQ(init.search, OptionalNone {});
    EXPECT_EQ(init.hash, OptionalNone {});
    EXPECT_EQ(init.base_url, OptionalNone {});
}
// Colons inside "[ ... ]" do not start the port; only the ":" after the closing bracket does.
TEST_CASE(ipv6_with_port_number)
{
    auto constructor_string = "http://[\\:\\:1]:8080"_string;
    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(constructor_string.code_points()));

    EXPECT_EQ(init.protocol, "http"sv);
    EXPECT_EQ(init.username, OptionalNone {});
    EXPECT_EQ(init.password, OptionalNone {});
    EXPECT_EQ(init.hostname, "[\\:\\:1]"sv);
    EXPECT_EQ(init.port, "8080"sv);
    EXPECT_EQ(init.pathname, OptionalNone {});
    EXPECT_EQ(init.search, OptionalNone {});
    EXPECT_EQ(init.hash, OptionalNone {});
    EXPECT_EQ(init.base_url, OptionalNone {});
}
// An escaped ":" still ends the protocol; with no "//" the rest is the pathname and hostname defaults to "".
TEST_CASE(data_url)
{
    auto constructor_string = "data\\:foobar"_string;
    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(constructor_string.code_points()));

    EXPECT_EQ(init.protocol, "data"sv);
    EXPECT_EQ(init.username, OptionalNone {});
    EXPECT_EQ(init.password, OptionalNone {});
    EXPECT_EQ(init.hostname, ""sv);
    EXPECT_EQ(init.port, ""sv);
    EXPECT_EQ(init.pathname, "foobar"sv);
    EXPECT_EQ(init.search, OptionalNone {});
    EXPECT_EQ(init.hash, OptionalNone {});
    EXPECT_EQ(init.base_url, OptionalNone {});
}
// "//" after the protocol makes the parser look for an authority even when the scheme is not a special one.
TEST_CASE(non_special_scheme_and_arbitary_hostname)
{
    auto constructor_string = "foo://bar"_string;
    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(constructor_string.code_points()));

    EXPECT_EQ(init.protocol, "foo"sv);
    EXPECT_EQ(init.username, OptionalNone {});
    EXPECT_EQ(init.password, OptionalNone {});
    EXPECT_EQ(init.hostname, "bar"sv);
    EXPECT_EQ(init.port, ""sv);
    EXPECT_EQ(init.pathname, OptionalNone {});
    EXPECT_EQ(init.search, OptionalNone {});
    EXPECT_EQ(init.hash, OptionalNone {});
    EXPECT_EQ(init.base_url, OptionalNone {});
}
// A ":name" group inside IPv6 brackets must not be mistaken for a port prefix; the whole "[:address]" is the hostname.
TEST_CASE(ipv6_with_named_group)
{
    auto constructor_string = "http://[:address]/"_string;
    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(constructor_string.code_points()));

    EXPECT_EQ(init.protocol, "http"sv);
    EXPECT_EQ(init.username, OptionalNone {});
    EXPECT_EQ(init.password, OptionalNone {});
    EXPECT_EQ(init.hostname, "[:address]"sv);
    EXPECT_EQ(init.port, ""sv);
    EXPECT_EQ(init.pathname, "/"sv);
    EXPECT_EQ(init.search, OptionalNone {});
    EXPECT_EQ(init.hash, OptionalNone {});
    EXPECT_EQ(init.base_url, OptionalNone {});
}