ladybird/Libraries/LibURL/Pattern/ConstructorStringParser.cpp
Shannon Booth e369756e9c LibURL/Pattern: Implement the constructor string parser
This is missing one small bit of functionality where the not-yet
implemented component compilation is required.
2025-03-15 07:39:03 -04:00

704 lines
30 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Debug.h>
#include <AK/GenericShorthands.h>
#include <LibURL/Pattern/Canonicalization.h>
#include <LibURL/Pattern/Component.h>
#include <LibURL/Pattern/ConstructorStringParser.h>
namespace URL::Pattern {
// Returns a human-readable name for the parser's current state.
// The name is only used for diagnostics (see the dbgln_if() trace in parse()).
StringView ConstructorStringParser::state_to_string() const
{
    auto const current_state = m_state;

    switch (current_state) {
    case State::Initial: return "Initial"sv;
    case State::Protocol: return "Protocol"sv;
    case State::Authority: return "Authority"sv;
    case State::Username: return "Username"sv;
    case State::Password: return "Password"sv;
    case State::Hostname: return "Hostname"sv;
    case State::Port: return "Port"sv;
    case State::Pathname: return "Pathname"sv;
    case State::Search: return "Search"sv;
    case State::Hash: return "Hash"sv;
    case State::Done: return "Done"sv;
    }

    // Every enumerator returns above; reaching this point means m_state holds an invalid value.
    VERIFY_NOT_REACHED();
}
// Builds a parser over `input` and the tokens previously produced from it.
// The token list is taken by value and moved into the parser.
ConstructorStringParser::ConstructorStringParser(Utf8View const& input, Vector<Token> token_list)
    : m_input(input)
    , m_token_list(move(token_list))
{
}
// https://urlpattern.spec.whatwg.org/#parse-a-constructor-string
//
// Splits a URLPattern constructor string into its per-component pattern strings.
//
// The input is tokenized with the lenient policy, then a state machine walks the token list and records the
// substring belonging to each URL component in the parser's result, which is returned on success.
//
// Errors: propagates any error produced by Tokenizer::tokenize() or by
// compute_protocol_matches_a_special_scheme_flag().
PatternErrorOr<Init> ConstructorStringParser::parse(Utf8View const& input)
{
    // 1. Let parser be a new constructor string parser whose input is input and token list is the result of running
    //    tokenize given input and "lenient".
    ConstructorStringParser parser { input, TRY(Tokenizer::tokenize(input, Tokenizer::Policy::Lenient)) };

    // 2. While parser's token index is less than parser's token list size:
    while (parser.m_token_index < parser.m_token_list.size()) {
        dbgln_if(URL_PATTERN_DEBUG, "{}\t| Token@{} (group depth {}) -> {}", parser.state_to_string(),
            parser.m_token_index, parser.m_group_depth, parser.m_token_list[parser.m_token_index].to_string());

        // 1. Set parser's token increment to 1.
        parser.m_token_increment = 1;

        // NOTE: On every iteration of the parse loop the parser's token index will be incremented by its token
        //       increment value. Typically this means incrementing by 1, but at certain times it is set to zero.
        //       The token increment is then always reset back to 1 at the top of the loop.

        // 2. If parser's token list[parser's token index]'s type is "end" then:
        if (parser.m_token_list[parser.m_token_index].type == Token::Type::End) {
            // 1. If parser's state is "init":
            if (parser.m_state == State::Initial) {
                // NOTE: If we reached the end of the string in the "init" state, then we failed to find a protocol
                //       terminator and this has to be a relative URLPattern constructor string.

                // 1. Run rewind given parser.
                parser.rewind();

                // NOTE: We next determine at which component the relative pattern begins. Relative pathnames are
                //       most common, but URLs and URLPattern constructor strings can begin with the search or hash
                //       components as well.

                // 2. If the result of running is a hash prefix given parser is true, then run change state given parser,
                //    "hash" and 1.
                if (parser.is_a_hash_prefix()) {
                    parser.change_state(State::Hash, 1);
                }
                // 3. Otherwise if the result of running is a search prefix given parser is true:
                else if (parser.is_a_search_prefix()) {
                    // 1. Run change state given parser, "search" and 1.
                    parser.change_state(State::Search, 1);
                }
                // 4. Otherwise:
                else {
                    // 1. Run change state given parser, "pathname" and 0.
                    parser.change_state(State::Pathname, 0);
                }

                // 5. Increment parser's token index by parser's token increment.
                parser.m_token_index += parser.m_token_increment;

                // 6. Continue.
                continue;
            }

            // 2. If parser's state is "authority":
            if (parser.m_state == State::Authority) {
                // NOTE: If we reached the end of the string in the "authority" state, then we failed to find an
                //       "@". Therefore there is no username or password.

                // 1. Run rewind and set state given parser, and "hostname".
                parser.rewind_and_set_state(State::Hostname);

                // 2. Increment parser's token index by parser's token increment.
                parser.m_token_index += parser.m_token_increment;

                // 3. Continue.
                continue;
            }

            // 3. Run change state given parser, "done" and 0.
            parser.change_state(State::Done, 0);

            // 4. Break.
            break;
        }

        // 3. If the result of running is a group open given parser is true:
        if (parser.is_a_group_open()) {
            // NOTE: We ignore all code points within "{ ... }" pattern groupings. It would not make sense to allow
            //       a URL component boundary to lie within a grouping; e.g. "https://example.c{om/fo}o". While not
            //       supported within well formed pattern strings, we handle nested groupings here to avoid parser
            //       confusion.
            //
            //       It is not necessary to perform this logic for regexp or named groups since those values are
            //       collapsed into individual tokens by the tokenize algorithm.

            // 1. Increment parser's group depth by 1.
            ++parser.m_group_depth;

            // 2. Increment parser's token index by parser's token increment.
            parser.m_token_index += parser.m_token_increment;

            // 3. Continue.
            continue;
        }

        // 4. If parser's group depth is greater than 0:
        if (parser.m_group_depth > 0) {
            // 1. If the result of running is a group close given parser is true, then decrement parser's group
            //    depth by 1.
            if (parser.is_a_group_close()) {
                VERIFY(parser.m_group_depth != 0);
                --parser.m_group_depth;
            }
            // 2. Otherwise:
            else {
                // 1. Increment parser's token index by parser's token increment.
                parser.m_token_index += parser.m_token_increment;

                // 2. Continue.
                continue;
            }
        }

        // 5. Switch on parser's state and run the associated steps:
        switch (parser.m_state) {
        // -> "init", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-init%E2%91%A2
        case State::Initial: {
            // 1. If the result of running is a protocol suffix given parser is true:
            if (parser.is_a_protocol_suffix()) {
                // 1. Run rewind and set state given parser and "protocol".
                parser.rewind_and_set_state(State::Protocol);
            }
            break;
        }
        // -> "protocol", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-protocol%E2%91%A0
        case State::Protocol: {
            // 1. If the result of running is a protocol suffix given parser is true:
            if (parser.is_a_protocol_suffix()) {
                // 1. Run compute protocol matches a special scheme flag given parser.
                TRY(parser.compute_protocol_matches_a_special_scheme_flag());

                // NOTE: We need to eagerly compile the protocol component to determine if it matches any special
                //       schemes. If it does then certain special rules apply. It determines if the pathname
                //       defaults to a "/" and also whether we will look for the username, password, hostname, and
                //       port components. Authority slashes can also cause us to look for these components as well.
                //       Otherwise we treat this as an "opaque path URL" and go straight to the pathname component.

                // 2. Let next state be "pathname".
                auto next_state = State::Pathname;

                // 3. Let skip be 1.
                u32 skip = 1;

                // 4. If the result of running next is authority slashes given parser is true:
                if (parser.next_is_authority_slashes()) {
                    // 1. Set next state to "authority".
                    next_state = State::Authority;

                    // 2. Set skip to 3.
                    skip = 3;
                }
                // 5. Otherwise if parser's protocol matches a special scheme flag is true, then set next state to
                //    "authority".
                else if (parser.m_protocol_matches_a_special_scheme) {
                    next_state = State::Authority;
                }

                // 6. Run change state given parser, next state, and skip.
                parser.change_state(next_state, skip);
            }
            break;
        }
        // -> "authority", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-authority%E2%91%A3
        case State::Authority: {
            // 1. If the result of running is an identity terminator given parser is true, then run rewind and set state
            //    given parser and "username".
            if (parser.is_an_identity_terminator()) {
                parser.rewind_and_set_state(State::Username);
            }
            // 2. Otherwise if any of the following are true:
            //     * the result of running is a pathname start given parser;
            //     * the result of running is a search prefix given parser; or
            //     * the result of running is a hash prefix given parser,
            //    then run rewind and set state given parser and "hostname".
            else if (parser.is_a_pathname_start()
                || parser.is_a_search_prefix()
                || parser.is_a_hash_prefix()) {
                parser.rewind_and_set_state(State::Hostname);
            }
            break;
        }
        // -> "username", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-username%E2%91%A0
        case State::Username: {
            // 1. If the result of running is a password prefix given parser is true, then run change state given
            //    parser, "password", and 1.
            if (parser.is_a_password_prefix()) {
                parser.change_state(State::Password, 1);
            }
            // 2. Otherwise if the result of running is an identity terminator given parser is true, then run change
            //    state given parser, "hostname", and 1.
            else if (parser.is_an_identity_terminator()) {
                parser.change_state(State::Hostname, 1);
            }
            break;
        }
        // -> "password", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-password%E2%91%A0
        case State::Password: {
            // 1. If the result of running is an identity terminator given parser is true, then run change state
            //    given parser, "hostname", and 1.
            if (parser.is_an_identity_terminator())
                parser.change_state(State::Hostname, 1);
            break;
        }
        // -> "hostname", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-hostname%E2%91%A3
        case State::Hostname: {
            // 1. If the result of running is an IPv6 open given parser is true, then increment parser's hostname
            //    IPv6 bracket depth by 1.
            if (parser.is_an_ipv6_open()) {
                ++parser.m_hostname_ipv6_bracket_depth;
            }
            // 2. Otherwise if the result of running is an IPv6 close given parser is true, then decrement parser's
            //    hostname IPv6 bracket depth by 1.
            else if (parser.is_an_ipv6_close()) {
                // NOTE: An unbalanced "]" (e.g. "https://example.com]/") is reachable from arbitrary input, so
                //       this must not assert that the depth is non-zero. The spec simply decrements here: the
                //       depth becomes non-zero (wrapping around if the counter is unsigned), which keeps a
                //       following ":" from being treated as a port prefix, and a later "[" brings it back to zero.
                --parser.m_hostname_ipv6_bracket_depth;
            }
            // 3. Otherwise if the result of running is a port prefix given parser is true and parser's hostname IPv6
            //    bracket depth is zero, then run change state given parser, "port", and 1.
            else if (parser.is_a_port_prefix() && parser.m_hostname_ipv6_bracket_depth == 0) {
                parser.change_state(State::Port, 1);
            }
            // 4. Otherwise if the result of running is a pathname start given parser is true, then run change state
            //    given parser, "pathname", and 0.
            else if (parser.is_a_pathname_start()) {
                parser.change_state(State::Pathname, 0);
            }
            // 5. Otherwise if the result of running is a search prefix given parser is true, then run change state
            //    given parser, "search", and 1.
            else if (parser.is_a_search_prefix()) {
                parser.change_state(State::Search, 1);
            }
            // 6. Otherwise if the result of running is a hash prefix given parser is true, then run change state
            //    given parser, "hash", and 1.
            else if (parser.is_a_hash_prefix()) {
                parser.change_state(State::Hash, 1);
            }
            break;
        }
        // -> "port", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-port%E2%91%A0
        case State::Port: {
            // 1. If the result of running is a pathname start given parser is true, then run change state given
            //    parser, "pathname", and 0.
            if (parser.is_a_pathname_start()) {
                parser.change_state(State::Pathname, 0);
            }
            // 2. Otherwise if the result of running is a search prefix given parser is true, then run change state
            //    given parser, "search", and 1.
            else if (parser.is_a_search_prefix()) {
                parser.change_state(State::Search, 1);
            }
            // 3. Otherwise if the result of running is a hash prefix given parser is true, then run change state given
            //    parser, "hash", and 1.
            else if (parser.is_a_hash_prefix()) {
                parser.change_state(State::Hash, 1);
            }
            break;
        }
        // -> "pathname", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-pathname%E2%91%A3
        case State::Pathname: {
            // 1. If the result of running is a search prefix given parser is true, then run change state given parser,
            //    "search", and 1.
            if (parser.is_a_search_prefix()) {
                parser.change_state(State::Search, 1);
            }
            // 2. Otherwise if the result of running is a hash prefix given parser is true, then run change state given
            //    parser, "hash", and 1.
            else if (parser.is_a_hash_prefix()) {
                parser.change_state(State::Hash, 1);
            }
            break;
        }
        // -> "search", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-search%E2%91%A3
        case State::Search: {
            // 1. If the result of running is a hash prefix given parser is true, then run change state given parser,
            //    "hash", and 1.
            if (parser.is_a_hash_prefix())
                parser.change_state(State::Hash, 1);
            break;
        }
        // -> "hash", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-hash%E2%91%A4
        case State::Hash: {
            // 1. Do nothing.
            break;
        }
        // -> "done", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-done%E2%91%A0
        case State::Done: {
            // 1. Assert: This step is never reached.
            VERIFY_NOT_REACHED();
        }
        }

        // 6. Increment parser's token index by parser's token increment.
        parser.m_token_index += parser.m_token_increment;
    }

    // 3. If parser's result contains "hostname" and not "port", then set parser's result["port"] to the empty string.
    if (parser.m_result.hostname.has_value() && !parser.m_result.port.has_value())
        parser.m_result.port = String {};

    // NOTE: This is special-cased because when an author does not specify a port, they usually intend the default
    //       port. If any port is acceptable, the author can specify it as a wildcard explicitly. For example,
    //       "https://example.com/*" does not match URLs beginning with "https://example.com:8443/", which is a
    //       different origin.

    // 4. Return parser's result.
    return parser.m_result;
}
// https://urlpattern.spec.whatwg.org/#make-a-component-string
//
// Returns the slice of the input that runs from the token at the parser's component start up to (but not
// including) the current token. Requires the current token index to be within the token list.
String ConstructorStringParser::make_a_component_string() const
{
    // 1. Assert: parser's token index is less than parser's token list's size.
    VERIFY(m_token_index < m_token_list.size());

    // 2. Let token be parser's token list[parser's token index].
    // 5. Let end index be token's index.
    auto const slice_end = m_token_list[m_token_index].index;

    // 3. Let component start token be the result of running get a safe token given parser and parser's
    //    component start.
    // 4. Let component start input index be component start token's index.
    auto const slice_begin = get_a_safe_token(m_component_start).index;

    // 6. Return the code point substring from component start input index to end index within parser's input.
    auto const component_view = m_input.unicode_substring_view(slice_begin, slice_end - slice_begin);
    return MUST(String::from_utf8(component_view.as_string()));
}
// https://urlpattern.spec.whatwg.org/#compute-protocol-matches-a-special-scheme-flag
//
// Intended to decide whether the protocol component parsed so far matches a special scheme.
// Currently a stub: it changes no parser state and always reports success, so this function never
// sets m_protocol_matches_a_special_scheme.
PatternErrorOr<void> ConstructorStringParser::compute_protocol_matches_a_special_scheme_flag()
{
// FIXME: Implement this. Per the spec this should build the protocol string with make a component string,
//        compile it as a protocol component, and set the parser's "protocol matches a special scheme flag"
//        when that component matches a special scheme. Component compilation is not available here yet.
return {};
}
// Returns the result entry that corresponds to the parser's current state.
// Must only be called in a state that maps to a URL component; "init", "authority" and "done" have no
// corresponding entry and trip VERIFY_NOT_REACHED().
Optional<String> const& ConstructorStringParser::result_for_active_state() const
{
    switch (m_state) {
    // States without a result entry fall out of the switch to the assertion below.
    case State::Initial:
    case State::Authority:
    case State::Done:
        break;

    case State::Protocol: return m_result.protocol;
    case State::Username: return m_result.username;
    case State::Password: return m_result.password;
    case State::Hostname: return m_result.hostname;
    case State::Port: return m_result.port;
    case State::Pathname: return m_result.pathname;
    case State::Search: return m_result.search;
    case State::Hash: return m_result.hash;
    }

    VERIFY_NOT_REACHED();
}
// Stores `value` into the result entry that corresponds to the parser's current state.
// Must only be called in a state that maps to a URL component; "init", "authority" and "done" have no
// corresponding entry and trip VERIFY_NOT_REACHED().
void ConstructorStringParser::set_result_for_active_state(Optional<String> value)
{
    switch (m_state) {
    // States without a result entry fall out of the switch to the assertion below.
    case State::Initial:
    case State::Authority:
    case State::Done:
        break;

    case State::Protocol:
        m_result.protocol = move(value);
        return;
    case State::Username:
        m_result.username = move(value);
        return;
    case State::Password:
        m_result.password = move(value);
        return;
    case State::Hostname:
        m_result.hostname = move(value);
        return;
    case State::Port:
        m_result.port = move(value);
        return;
    case State::Pathname:
        m_result.pathname = move(value);
        return;
    case State::Search:
        m_result.search = move(value);
        return;
    case State::Hash:
        m_result.hash = move(value);
        return;
    }

    VERIFY_NOT_REACHED();
}
// https://urlpattern.spec.whatwg.org/#change-state
//
// Finishes the component for the current state, fills in empty defaults for components that were skipped over,
// then switches to `new_state`. The token index is advanced by `skip`, the new component starts at the resulting
// index, and the token increment is zeroed so the caller's loop does not advance a second time.
void ConstructorStringParser::change_state(State new_state, u32 skip)
{
    auto const previous_state = m_state;

    // 1. If parser's state is not "init", not "authority", and not "done", then set parser's result[parser's state]
    //    to the result of running make a component string given parser.
    if (!first_is_one_of(previous_state, State::Initial, State::Authority, State::Done))
        set_result_for_active_state(make_a_component_string());

    // 2. If parser's state is not "init" and new state is not "done", then:
    if (previous_state != State::Initial && new_state != State::Done) {
        // The three sub-steps test progressively larger sets of "previous" states, so build them up once.
        bool const was_before_hostname = first_is_one_of(previous_state, State::Protocol, State::Authority, State::Username, State::Password);
        bool const was_before_pathname = was_before_hostname || first_is_one_of(previous_state, State::Hostname, State::Port);
        bool const was_before_search = was_before_pathname || previous_state == State::Pathname;

        // 1. If parser's state is "protocol", "authority", "username", or "password"; new state is "port",
        //    "pathname", "search", or "hash"; and parser's result["hostname"] does not exist, then set parser's
        //    result["hostname"] to the empty string.
        bool const skips_hostname = first_is_one_of(new_state, State::Port, State::Pathname, State::Search, State::Hash);
        if (was_before_hostname && skips_hostname && !m_result.hostname.has_value())
            m_result.hostname = String {};

        // 2. If parser's state is "protocol", "authority", "username", "password", "hostname", or "port"; new state
        //    is "search" or "hash"; and parser's result["pathname"] does not exist, then:
        bool const skips_pathname = first_is_one_of(new_state, State::Search, State::Hash);
        if (was_before_pathname && skips_pathname && !m_result.pathname.has_value()) {
            // 1. If parser's protocol matches a special scheme flag is true, then set parser's result["pathname"]
            //    to "/".
            // 2. Otherwise, set parser's result["pathname"] to the empty string.
            if (m_protocol_matches_a_special_scheme)
                m_result.pathname = "/"_string;
            else
                m_result.pathname = String {};
        }

        // 3. If parser's state is "protocol", "authority", "username", "password", "hostname", "port", or
        //    "pathname"; new state is "hash"; and parser's result["search"] does not exist, then set parser's
        //    result["search"] to the empty string.
        if (was_before_search && new_state == State::Hash && !m_result.search.has_value())
            m_result.search = String {};
    }

    // 3. Set parser's state to new state.
    m_state = new_state;

    // 4. Increment parser's token index by skip.
    m_token_index += skip;

    // 5. Set parser's component start to parser's token index.
    m_component_start = m_token_index;

    // 6. Set parser's token increment to 0.
    m_token_increment = 0;
}
// https://urlpattern.spec.whatwg.org/#next-is-authority-slashes
//
// Returns true when both of the two tokens following the current one are a non-special "/".
bool ConstructorStringParser::next_is_authority_slashes() const
{
    // 1. If the result of running is a non-special pattern char given parser, parser's token index + 1, and "/" is
    //    false, then return false.
    // 2. If the result of running is a non-special pattern char given parser, parser's token index + 2, and "/" is
    //    false, then return false.
    // 3. Return true.
    return is_a_non_special_pattern_char(m_token_index + 1, '/')
        && is_a_non_special_pattern_char(m_token_index + 2, '/');
}
// https://urlpattern.spec.whatwg.org/#is-an-identity-terminator
//
// Returns true when the current token is a non-special "@".
bool ConstructorStringParser::is_an_identity_terminator() const
{
    // 1. Return the result of running is a non-special pattern char given parser, parser's token index, and "@".
    constexpr char identity_terminator = '@';
    return is_a_non_special_pattern_char(m_token_index, identity_terminator);
}
// https://urlpattern.spec.whatwg.org/#is-a-password-prefix
//
// Returns true when the current token is a non-special ":".
bool ConstructorStringParser::is_a_password_prefix() const
{
    // 1. Return the result of running is a non-special pattern char given parser, parser's token index, and ":".
    constexpr char password_prefix = ':';
    return is_a_non_special_pattern_char(m_token_index, password_prefix);
}
// https://urlpattern.spec.whatwg.org/#is-a-port-prefix
//
// Returns true when the current token is a non-special ":".
bool ConstructorStringParser::is_a_port_prefix() const
{
    // 1. Return the result of running is a non-special pattern char given parser, parser's token index, and ":".
    constexpr char port_prefix = ':';
    return is_a_non_special_pattern_char(m_token_index, port_prefix);
}
// https://urlpattern.spec.whatwg.org/#is-a-pathname-start
//
// Returns true when the current token is a non-special "/".
bool ConstructorStringParser::is_a_pathname_start() const
{
    // 1. Return the result of running is a non-special pattern char given parser, parser's token index, and "/".
    constexpr char pathname_start = '/';
    return is_a_non_special_pattern_char(m_token_index, pathname_start);
}
// https://urlpattern.spec.whatwg.org/#is-a-search-prefix
//
// Returns true when the current token starts the search component. A non-special "?" always does. Any other
// token whose value is "?" only does when it is the first token or when the token before it is not a name,
// regexp, close or asterisk token.
bool ConstructorStringParser::is_a_search_prefix() const
{
    // 1. If result of running is a non-special pattern char given parser, parser's token index and "?" is true,
    //    then return true.
    if (is_a_non_special_pattern_char(m_token_index, '?'))
        return true;

    // 2. If parser's token list[parser's token index]'s value is not "?", then return false.
    if (m_token_list[m_token_index].value != "?"sv)
        return false;

    // 3. Let previous index be parser's token index - 1.
    // 4. If previous index is less than 0, then return true.
    // NOTE: The index is unsigned, so the "less than 0" case is detected before subtracting.
    if (m_token_index == 0)
        return true;

    // 5. Let previous token be the result of running get a safe token given parser and previous index.
    // 6. If any of the following are true, then return false:
    //     * previous token's type is "name".
    //     * previous token's type is "regexp".
    //     * previous token's type is "close".
    //     * previous token's type is "asterisk".
    // 7. Return true.
    switch (get_a_safe_token(m_token_index - 1).type) {
    case Token::Type::Name:
    case Token::Type::Regexp:
    case Token::Type::Close:
    case Token::Type::Asterisk:
        return false;
    default:
        return true;
    }
}
// https://urlpattern.spec.whatwg.org/#is-a-protocol-suffix
//
// Returns true when the current token is a non-special ":".
bool ConstructorStringParser::is_a_protocol_suffix() const
{
    // 1. Return the result of running is a non-special pattern char given parser, parser's token index, and ":".
    constexpr char protocol_suffix = ':';
    return is_a_non_special_pattern_char(m_token_index, protocol_suffix);
}
// https://urlpattern.spec.whatwg.org/#is-a-hash-prefix
//
// Returns true when the current token is a non-special "#".
bool ConstructorStringParser::is_a_hash_prefix() const
{
    // 1. Return the result of running is a non-special pattern char given parser, parser's token index and "#".
    constexpr char hash_prefix = '#';
    return is_a_non_special_pattern_char(m_token_index, hash_prefix);
}
// https://urlpattern.spec.whatwg.org/#is-a-group-open
//
// Returns true when the current token has type "open".
bool ConstructorStringParser::is_a_group_open() const
{
    // 1. If parser's token list[parser's token index]'s type is "open", then return true.
    // 2. Otherwise return false.
    auto const& current_token = m_token_list[m_token_index];
    return current_token.type == Token::Type::Open;
}
// https://urlpattern.spec.whatwg.org/#is-a-group-close
//
// Returns true when the current token has type "close".
bool ConstructorStringParser::is_a_group_close() const
{
    // 1. If parser's token list[parser's token index]'s type is "close", then return true.
    // 2. Otherwise return false.
    auto const& current_token = m_token_list[m_token_index];
    return current_token.type == Token::Type::Close;
}
// https://urlpattern.spec.whatwg.org/#is-an-ipv6-open
//
// Returns true when the current token is a non-special "[".
bool ConstructorStringParser::is_an_ipv6_open() const
{
    // 1. Return the result of running is a non-special pattern char given parser, parser's token index, and "[".
    constexpr char ipv6_open = '[';
    return is_a_non_special_pattern_char(m_token_index, ipv6_open);
}
// https://urlpattern.spec.whatwg.org/#is-an-ipv6-close
//
// Returns true when the current token is a non-special "]".
bool ConstructorStringParser::is_an_ipv6_close() const
{
    // 1. Return the result of running is a non-special pattern char given parser, parser's token index, and "]".
    constexpr char ipv6_close = ']';
    return is_a_non_special_pattern_char(m_token_index, ipv6_close);
}
// https://urlpattern.spec.whatwg.org/#get-a-safe-token
//
// Returns the token at `index`, or the last token of the list when `index` is past the end. In the
// out-of-range case the list must be non-empty and its last token must have type "end".
Token const& ConstructorStringParser::get_a_safe_token(u32 index) const
{
    bool const index_is_out_of_range = index >= m_token_list.size();
    if (index_is_out_of_range) {
        // 2. Assert: parser's token list's size is greater than or equal to 1.
        VERIFY(!m_token_list.is_empty());

        // 3. Let last index be parser's token list's size - 1.
        // 4. Let token be parser's token list[last index].
        auto const& end_token = m_token_list.last();

        // 5. Assert: token's type is "end".
        VERIFY(end_token.type == Token::Type::End);

        // 6. Return token.
        return end_token;
    }

    // 1. If index is less than parser's token list's size, then return parser's token list[index].
    return m_token_list[index];
}
// https://urlpattern.spec.whatwg.org/#is-a-non-special-pattern-char
//
// Returns true when the (safe) token at `index` starts with the byte `value` and has one of the plain character
// types: "char", "escaped-char" or "invalid-char".
bool ConstructorStringParser::is_a_non_special_pattern_char(u32 index, char value) const
{
    // 1. Let token be the result of running get a safe token given parser and index.
    auto const& candidate = get_a_safe_token(index);

    // 2. If token's value is not value, then return false.
    // NOTE: Only the first byte of the token's value is compared against `value`.
    bool const value_matches = !candidate.value.is_empty() && candidate.value.bytes().first() == value;
    if (!value_matches)
        return false;

    // 3. If any of the following are true:
    //     * token's type is "char";
    //     * token's type is "escaped-char"; or
    //     * token's type is "invalid-char",
    //    then return true.
    // 4. Return false.
    return first_is_one_of(candidate.type, Token::Type::Char, Token::Type::EscapedChar, Token::Type::InvalidChar);
}
// https://urlpattern.spec.whatwg.org/#rewind
void ConstructorStringParser::rewind()
{
// 1. Set parsers token index to parsers component start.
m_token_index = m_component_start;
// 2. Set parsers token increment to 0.
m_token_increment = 0;
}
// https://urlpattern.spec.whatwg.org/#rewind-and-set-state
//
// Rewinds to the start of the current component and switches to `state`. Unlike change_state(), this assigns
// the state directly and does not touch the parser's result.
void ConstructorStringParser::rewind_and_set_state(State state)
{
    // 1. Run rewind given parser.
    this->rewind();

    // 2. Set parser's state to state.
    this->m_state = state;
}
}