diff --git a/Libraries/LibURL/CMakeLists.txt b/Libraries/LibURL/CMakeLists.txt index cfec8cadb9d..63f9bd7d469 100644 --- a/Libraries/LibURL/CMakeLists.txt +++ b/Libraries/LibURL/CMakeLists.txt @@ -8,6 +8,7 @@ set(SOURCES URL.cpp ${PUBLIC_SUFFIX_SOURCES} Pattern/Canonicalization.cpp + Pattern/ConstructorStringParser.cpp Pattern/Pattern.cpp Pattern/Tokenizer.cpp ) diff --git a/Libraries/LibURL/Pattern/ConstructorStringParser.cpp b/Libraries/LibURL/Pattern/ConstructorStringParser.cpp new file mode 100644 index 00000000000..a6a1f50bc31 --- /dev/null +++ b/Libraries/LibURL/Pattern/ConstructorStringParser.cpp @@ -0,0 +1,704 @@ +/* + * Copyright (c) 2025, Shannon Booth + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include +#include +#include + +namespace URL::Pattern { + +StringView ConstructorStringParser::state_to_string() const +{ + switch (m_state) { + case State::Initial: + return "Initial"sv; + case State::Protocol: + return "Protocol"sv; + case State::Authority: + return "Authority"sv; + case State::Username: + return "Username"sv; + case State::Password: + return "Password"sv; + case State::Hostname: + return "Hostname"sv; + case State::Port: + return "Port"sv; + case State::Pathname: + return "Pathname"sv; + case State::Search: + return "Search"sv; + case State::Hash: + return "Hash"sv; + case State::Done: + return "Done"sv; + } + VERIFY_NOT_REACHED(); +} + +ConstructorStringParser::ConstructorStringParser(Utf8View const& input, Vector token_list) + : m_input(input) + , m_token_list(move(token_list)) +{ +} + +// https://urlpattern.spec.whatwg.org/#parse-a-constructor-string +PatternErrorOr ConstructorStringParser::parse(Utf8View const& input) +{ + // 1. Let parser be a new constructor string parser whose input is input and token list is the result of running + // tokenize given input and "lenient". + ConstructorStringParser parser { input, TRY(Tokenizer::tokenize(input, Tokenizer::Policy::Lenient)) }; + + // 2. 
While parser’s token index is less than parser’s token list size: + while (parser.m_token_index < parser.m_token_list.size()) { + dbgln_if(URL_PATTERN_DEBUG, "{}\t| Token@{} (group depth {}) -> {}", parser.state_to_string(), + parser.m_token_index, parser.m_group_depth, parser.m_token_list[parser.m_token_index].to_string()); + // 1. Set parser’s token increment to 1. + parser.m_token_increment = 1; + + // NOTE: On every iteration of the parse loop the parser’s token index will be incremented by its token + // increment value. Typically this means incrementing by 1, but at certain times it is set to zero. + // The token increment is then always reset back to 1 at the top of the loop. + + // 2. If parser’s token list[parser’s token index]'s type is "end" then: + if (parser.m_token_list[parser.m_token_index].type == Token::Type::End) { + // 1. If parser’s state is "init": + if (parser.m_state == State::Initial) { + // NOTE: If we reached the end of the string in the "init" state, then we failed to find a protocol + // terminator and this has to be a relative URLPattern constructor string. + + // 1. Run rewind given parser. + parser.rewind(); + + // NOTE: We next determine at which component the relative pattern begins. Relative pathnames are + // most common, but URLs and URLPattern constructor strings can begin with the search or hash + // components as well. + + // 2. If the result of running is a hash prefix given parser is true, then run change state given parser, + // "hash" and 1. + if (parser.is_a_hash_prefix()) { + parser.change_state(State::Hash, 1); + } + // 3. Otherwise if the result of running is a search prefix given parser is true: + else if (parser.is_a_search_prefix()) { + // 1. Run change state given parser, "search" and 1. + parser.change_state(State::Search, 1); + } + // 4. Otherwise: + else { + // 1. Run change state given parser, "pathname" and 0. + parser.change_state(State::Pathname, 0); + } + + // 5. 
Increment parser’s token index by parser’s token increment. + parser.m_token_index += parser.m_token_increment; + + // 6. Continue. + continue; + } + + // 2. If parser’s state is "authority": + if (parser.m_state == State::Authority) { + // NOTE: If we reached the end of the string in the "authority" state, then we failed to find an + // "@". Therefore there is no username or password. + + // 1. Run rewind and set state given parser, and "hostname". + parser.rewind_and_set_state(State::Hostname); + + // 2. Increment parser’s token index by parser’s token increment. + parser.m_token_index += parser.m_token_increment; + + // 3. Continue. + continue; + } + + // 3. Run change state given parser, "done" and 0. + parser.change_state(State::Done, 0); + + // 4. Break. + break; + } + + // 3. If the result of running is a group open given parser is true: + if (parser.is_a_group_open()) { + // NOTE: We ignore all code points within "{ ... }" pattern groupings. It would not make sense to allow + // a URL component boundary to lie within a grouping; e.g. "https://example.c{om/fo}o". While not + // supported within well formed pattern strings, we handle nested groupings here to avoid parser + // confusion. + // + // It is not necessary to perform this logic for regexp or named groups since those values are collapsed into + // individual tokens by the tokenize algorithm. + + // 1. Increment parser’s group depth by 1. + ++parser.m_group_depth; + + // 2. Increment parser’s token index by parser’s token increment. + parser.m_token_index += parser.m_token_increment; + + // 3. Continue. + continue; + } + + // 4. If parser’s group depth is greater than 0: + if (parser.m_group_depth > 0) { + // 1. If the result of running is a group close given parser is true, then decrement parser’s group depth by 1. + if (parser.is_a_group_close()) { + VERIFY(parser.m_group_depth != 0); + --parser.m_group_depth; + } + // 2. Otherwise: + else { + // 1. 
Increment parser’s token index by parser’s token increment. + parser.m_token_index += parser.m_token_increment; + + // 2. Continue. + continue; + } + } + + // 5. Switch on parser’s state and run the associated steps: + switch (parser.m_state) { + // -> "init", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-init%E2%91%A2 + case State::Initial: { + // 1. If the result of running is a protocol suffix given parser is true: + if (parser.is_a_protocol_suffix()) { + // 1. Run rewind and set state given parser and "protocol". + parser.rewind_and_set_state(State::Protocol); + } + break; + } + // -> "protocol", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-protocol%E2%91%A0 + case State::Protocol: { + // 1. If the result of running is a protocol suffix given parser is true: + if (parser.is_a_protocol_suffix()) { + // 1. Run compute protocol matches a special scheme flag given parser. + TRY(parser.compute_protocol_matches_a_special_scheme_flag()); + + // NOTE: We need to eagerly compile the protocol component to determine if it matches any special + // schemes. If it does then certain special rules apply. It determines if the pathname + // defaults to a "/" and also whether we will look for the username, password, hostname, and + // port components. Authority slashes can also cause us to look for these components as well. + // Otherwise we treat this as an "opaque path URL" and go straight to the pathname component. + + // 2. Let next state be "pathname". + auto next_state = State::Pathname; + + // 3. Let skip be 1. + u32 skip = 1; + + // 4. If the result of running next is authority slashes given parser is true: + if (parser.next_is_authority_slashes()) { + // 1. Set next state to "authority". + next_state = State::Authority; + + // 2. Set skip to 3. + skip = 3; + } + // 5. Otherwise if parser’s protocol matches a special scheme flag is true, then set next state to "authority". 
+ else if (parser.m_protocol_matches_a_special_scheme) { + next_state = State::Authority; + } + + // 6. Run change state given parser, next state, and skip. + parser.change_state(next_state, skip); + } + break; + } + // -> "authority", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-authority%E2%91%A3 + case State::Authority: { + // 1. If the result of running is an identity terminator given parser is true, then run rewind and set state + // given parser and "username". + if (parser.is_an_identity_terminator()) { + parser.rewind_and_set_state(State::Username); + } + // 2. Otherwise if any of the following are true: + // * the result of running is a pathname start given parser; + // * the result of running is a search prefix given parser; or + // * the result of running is a hash prefix given parser, + // then run rewind and set state given parser and "hostname". + else if (parser.is_a_pathname_start() + || parser.is_a_search_prefix() + || parser.is_a_hash_prefix()) { + parser.rewind_and_set_state(State::Hostname); + } + break; + } + // -> "username", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-username%E2%91%A0 + case State::Username: { + // 1. If the result of running is a password prefix given parser is true, then run change state given + // parser, "password", and 1. + if (parser.is_a_password_prefix()) { + parser.change_state(State::Password, 1); + } + // 2. Otherwise if the result of running is an identity terminator given parser is true, then run change + // state given parser, "hostname", and 1. + else if (parser.is_an_identity_terminator()) { + parser.change_state(State::Hostname, 1); + } + break; + } + // -> "password", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-password%E2%91%A0 + case State::Password: { + // 1. If the result of running is an identity terminator given parser is true, then run change state + // given parser, "hostname", and 1. 
+ if (parser.is_an_identity_terminator()) + parser.change_state(State::Hostname, 1); + break; + } + // -> "hostname", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-hostname%E2%91%A3 + case State::Hostname: { + // 1. If the result of running is an IPv6 open given parser is true, then increment parser’s hostname + // IPv6 bracket depth by 1. + if (parser.is_an_ipv6_open()) { + ++parser.m_hostname_ipv6_bracket_depth; + } + // 2. Otherwise if the result of running is an IPv6 close given parser is true, then decrement parser’s + // hostname IPv6 bracket depth by 1. + else if (parser.is_an_ipv6_close()) { + VERIFY(parser.m_hostname_ipv6_bracket_depth != 0); + --parser.m_hostname_ipv6_bracket_depth; + } + // 3. Otherwise if the result of running is a port prefix given parser is true and parser’s hostname IPv6 + // bracket depth is zero, then run change state given parser, "port", and 1. + else if (parser.is_a_port_prefix() && parser.m_hostname_ipv6_bracket_depth == 0) { + parser.change_state(State::Port, 1); + } + // 4. Otherwise if the result of running is a pathname start given parser is true, then run change state + // given parser, "pathname", and 0. + else if (parser.is_a_pathname_start()) { + parser.change_state(State::Pathname, 0); + } + // 5. Otherwise if the result of running is a search prefix given parser is true, then run change state + // given parser, "search", and 1. + else if (parser.is_a_search_prefix()) { + parser.change_state(State::Search, 1); + } + // 6. Otherwise if the result of running is a hash prefix given parser is true, then run change state + // given parser, "hash", and 1. + else if (parser.is_a_hash_prefix()) { + parser.change_state(State::Hash, 1); + } + break; + } + // -> "port", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-port%E2%91%A0 + case State::Port: { + // 1. 
If the result of running is a pathname start given parser is true, then run change state given + // parser, "pathname", and 0. + if (parser.is_a_pathname_start()) { + parser.change_state(State::Pathname, 0); + } + // 2. Otherwise if the result of running is a search prefix given parser is true, then run change state + // given parser, "search", and 1. + else if (parser.is_a_search_prefix()) { + parser.change_state(State::Search, 1); + } + // 3. Otherwise if the result of running is a hash prefix given parser is true, then run change state given + // parser, "hash", and 1. + else if (parser.is_a_hash_prefix()) { + parser.change_state(State::Hash, 1); + } + break; + } + // -> "pathname", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-pathname%E2%91%A3 + case State::Pathname: { + // 1. If the result of running is a search prefix given parser is true, then run change state given parser, + // "search", and 1. + if (parser.is_a_search_prefix()) { + parser.change_state(State::Search, 1); + } + // 2. Otherwise if the result of running is a hash prefix given parser is true, then run change state given + // parser, "hash", and 1. + else if (parser.is_a_hash_prefix()) { + parser.change_state(State::Hash, 1); + } + break; + } + // -> "search", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-search%E2%91%A3 + case State::Search: { + // 1. If the result of running is a hash prefix given parser is true, then run change state given parser, + // "hash", and 1. + if (parser.is_a_hash_prefix()) + parser.change_state(State::Hash, 1); + break; + } + // -> "hash", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-hash%E2%91%A4 + case State::Hash: { + // 1. Do nothing. + break; + } + // -> "done", https://urlpattern.spec.whatwg.org/#ref-for-constructor-string-parser-state-done%E2%91%A0 + case State::Done: { + // 1. Assert: This step is never reached. + VERIFY_NOT_REACHED(); + } + } + + // 6. 
Increment parser’s token index by parser’s token increment. + parser.m_token_index += parser.m_token_increment; + } + + // 3. If parser’s result contains "hostname" and not "port", then set parser’s result["port"] to the empty string. + if (parser.m_result.hostname.has_value() && !parser.m_result.port.has_value()) + parser.m_result.port = String {}; + + // NOTE: This is special-cased because when an author does not specify a port, they usually intend the default + // port. If any port is acceptable, the author can specify it as a wildcard explicitly. For example, + // "https://example.com/*" does not match URLs beginning with "https://example.com:8443/", which is a + // different origin. + + // 4. Return parser’s result. + return parser.m_result; +} + +// https://urlpattern.spec.whatwg.org/#make-a-component-string +String ConstructorStringParser::make_a_component_string() const +{ + // 1. Assert: parser’s token index is less than parser’s token list's size. + VERIFY(m_token_index < m_token_list.size()); + + // 2. Let token be parser’s token list[parser’s token index]. + auto const& token = m_token_list[m_token_index]; + + // 3. Let component start token be the result of running get a safe token given parser and parser’s component start. + auto const& component_start_token = get_a_safe_token(m_component_start); + + // 4. Let component start input index be component start token’s index. + auto component_start_input_index = component_start_token.index; + + // 5. Let end index be token’s index. + auto end_index = token.index; + + // 6. Return the code point substring from component start input index to end index within parser’s input. 
+ auto sub_view = m_input.unicode_substring_view(component_start_input_index, end_index - component_start_input_index); + return MUST(String::from_utf8(sub_view.as_string())); +} + +// https://urlpattern.spec.whatwg.org/#compute-protocol-matches-a-special-scheme-flag +PatternErrorOr ConstructorStringParser::compute_protocol_matches_a_special_scheme_flag() +{ + // FIXME: Implement this. + return {}; +} + +Optional const& ConstructorStringParser::result_for_active_state() const +{ + switch (m_state) { + case State::Protocol: + return m_result.protocol; + case State::Username: + return m_result.username; + case State::Password: + return m_result.password; + case State::Hostname: + return m_result.hostname; + case State::Port: + return m_result.port; + case State::Pathname: + return m_result.pathname; + case State::Search: + return m_result.search; + case State::Hash: + return m_result.hash; + case State::Initial: + case State::Authority: + case State::Done: + break; + } + VERIFY_NOT_REACHED(); +} + +void ConstructorStringParser::set_result_for_active_state(Optional value) +{ + switch (m_state) { + case State::Protocol: + m_result.protocol = move(value); + break; + case State::Username: + m_result.username = move(value); + break; + case State::Password: + m_result.password = move(value); + break; + case State::Hostname: + m_result.hostname = move(value); + break; + case State::Port: + m_result.port = move(value); + break; + case State::Pathname: + m_result.pathname = move(value); + break; + case State::Search: + m_result.search = move(value); + break; + case State::Hash: + m_result.hash = move(value); + break; + case State::Initial: + case State::Authority: + case State::Done: + VERIFY_NOT_REACHED(); + } +} + +// https://urlpattern.spec.whatwg.org/#change-state +void ConstructorStringParser::change_state(State new_state, u32 skip) +{ + // 1. 
If parser’s state is not "init", not "authority", and not "done", then set parser’s result[parser’s state] to + // the result of running make a component string given parser. + if (m_state != State::Initial && m_state != State::Authority && m_state != State::Done) + set_result_for_active_state(make_a_component_string()); + + // 2. If parser’s state is not "init" and new state is not "done", then: + if (m_state != State::Initial && new_state != State::Done) { + // 1. If parser’s state is "protocol", "authority", "username", or "password"; new state is "port", "pathname", + // "search", or "hash"; and parser’s result["hostname"] does not exist, then set parser’s result["hostname"] + // to the empty string. + if (first_is_one_of(m_state, State::Protocol, State::Authority, State::Username, State::Password) + && first_is_one_of(new_state, State::Port, State::Pathname, State::Search, State::Hash) + && !m_result.hostname.has_value()) { + m_result.hostname = String {}; + } + + // 2. If parser’s state is "protocol", "authority", "username", "password", "hostname", or "port"; new state is + // "search" or "hash"; and parser’s result["pathname"] does not exist, then: + if (first_is_one_of(m_state, State::Protocol, State::Authority, State::Username, State::Password, State::Hostname, State::Port) + && first_is_one_of(new_state, State::Search, State::Hash) + && !m_result.pathname.has_value()) { + // 1. If parser’s protocol matches a special scheme flag is true, then set parser’s result["pathname"] to "/". + if (m_protocol_matches_a_special_scheme) { + m_result.pathname = "/"_string; + } + // 2. Otherwise, set parser’s result["pathname"] to the empty string. + else { + m_result.pathname = String {}; + } + } + + // 3. If parser’s state is "protocol", "authority", "username", "password", "hostname", "port", or "pathname"; + // new state is "hash"; and parser’s result["search"] does not exist, then set parser’s result["search"] + // to the empty string. 
+ if (first_is_one_of(m_state, State::Protocol, State::Authority, State::Username, State::Password, State::Hostname, State::Port, State::Pathname) + && new_state == State::Hash + && !m_result.search.has_value()) { + m_result.search = String {}; + } + } + + // 3. Set parser’s state to new state. + m_state = new_state; + + // 4. Increment parser’s token index by skip. + m_token_index += skip; + + // 5. Set parser’s component start to parser’s token index. + m_component_start = m_token_index; + + // 6. Set parser’s token increment to 0. + m_token_increment = 0; +} + +// https://urlpattern.spec.whatwg.org/#next-is-authority-slashes +bool ConstructorStringParser::next_is_authority_slashes() const +{ + // 1. If the result of running is a non-special pattern char given parser, parser’s token index + 1, and "/" is false, + // then return false. + if (!is_a_non_special_pattern_char(m_token_index + 1, '/')) + return false; + + // 2. If the result of running is a non-special pattern char given parser, parser’s token index + 2, and "/" is false, + // then return false. + if (!is_a_non_special_pattern_char(m_token_index + 2, '/')) + return false; + + // 3. Return true. + return true; +} + +// https://urlpattern.spec.whatwg.org/#is-an-identity-terminator +bool ConstructorStringParser::is_an_identity_terminator() const +{ + // 1. Return the result of running is a non-special pattern char given parser, parser’s token index, and "@". + return is_a_non_special_pattern_char(m_token_index, '@'); +} + +// https://urlpattern.spec.whatwg.org/#is-a-password-prefix +bool ConstructorStringParser::is_a_password_prefix() const +{ + // 1. Return the result of running is a non-special pattern char given parser, parser’s token index, and ":". + return is_a_non_special_pattern_char(m_token_index, ':'); +} + +// https://urlpattern.spec.whatwg.org/#is-a-port-prefix +bool ConstructorStringParser::is_a_port_prefix() const +{ + // 1. 
Return the result of running is a non-special pattern char given parser, parser’s token index, and ":". + return is_a_non_special_pattern_char(m_token_index, ':'); +} + +// https://urlpattern.spec.whatwg.org/#is-a-pathname-start +bool ConstructorStringParser::is_a_pathname_start() const +{ + // 1. Return the result of running is a non-special pattern char given parser, parser’s token index, and "/". + return is_a_non_special_pattern_char(m_token_index, '/'); +} + +// https://urlpattern.spec.whatwg.org/#is-a-search-prefix +bool ConstructorStringParser::is_a_search_prefix() const +{ + // 1. If result of running is a non-special pattern char given parser, parser’s token index and "?" is true, + // then return true. + if (is_a_non_special_pattern_char(m_token_index, '?')) + return true; + + // 2. If parser’s token list[parser’s token index]'s value is not "?", then return false. + if (m_token_list[m_token_index].value != "?"sv) + return false; + + // 3. Let previous index be parser’s token index − 1. + // 4. If previous index is less than 0, then return true. + if (m_token_index == 0) + return true; + auto previous_index = m_token_index - 1; + + // 5. Let previous token be the result of running get a safe token given parser and previous index. + auto const& previous_token = get_a_safe_token(previous_index); + + // 6. If any of the following are true, then return false: + // * previous token’s type is "name". + // * previous token’s type is "regexp". + // * previous token’s type is "close". + // * previous token’s type is "asterisk". + if (previous_token.type == Token::Type::Name + || previous_token.type == Token::Type::Regexp + || previous_token.type == Token::Type::Close + || previous_token.type == Token::Type::Asterisk) { + return false; + } + + // 7. Return true. + return true; +} + +// https://urlpattern.spec.whatwg.org/#is-a-protocol-suffix +bool ConstructorStringParser::is_a_protocol_suffix() const +{ + // 1. 
Return the result of running is a non-special pattern char given parser, parser’s token index, and ":". + return is_a_non_special_pattern_char(m_token_index, ':'); +} + +// https://urlpattern.spec.whatwg.org/#is-a-hash-prefix +bool ConstructorStringParser::is_a_hash_prefix() const +{ + // 1. Return the result of running is a non-special pattern char given parser, parser’s token index and "#". + return is_a_non_special_pattern_char(m_token_index, '#'); +} + +// https://urlpattern.spec.whatwg.org/#is-a-group-open +bool ConstructorStringParser::is_a_group_open() const +{ + // 1. If parser’s token list[parser’s token index]'s type is "open", then return true. + if (m_token_list[m_token_index].type == Token::Type::Open) + return true; + + // 2. Otherwise return false. + return false; +} + +// https://urlpattern.spec.whatwg.org/#is-a-group-close +bool ConstructorStringParser::is_a_group_close() const +{ + // 1. If parser’s token list[parser’s token index]'s type is "close", then return true. + if (m_token_list[m_token_index].type == Token::Type::Close) + return true; + + // 2. Otherwise return false. + return false; +} + +// https://urlpattern.spec.whatwg.org/#is-an-ipv6-open +bool ConstructorStringParser::is_an_ipv6_open() const +{ + // 1. Return the result of running is a non-special pattern char given parser, parser’s token index, and "[". + return is_a_non_special_pattern_char(m_token_index, '['); +} + +// https://urlpattern.spec.whatwg.org/#is-an-ipv6-close +bool ConstructorStringParser::is_an_ipv6_close() const +{ + // 1. Return the result of running is a non-special pattern char given parser, parser’s token index, and "]". + return is_a_non_special_pattern_char(m_token_index, ']'); +} + +// https://urlpattern.spec.whatwg.org/#get-a-safe-token +Token const& ConstructorStringParser::get_a_safe_token(u32 index) const +{ + // 1. If index is less than parser’s token list's size, then return parser’s token list[index]. 
+ if (index < m_token_list.size()) + return m_token_list[index]; + + // 2. Assert: parser’s token list's size is greater than or equal to 1. + VERIFY(!m_token_list.is_empty()); + + // 3. Let last index be parser’s token list's size − 1. + // 4. Let token be parser’s token list[last index]. + auto const& token = m_token_list.last(); + + // 5. Assert: token’s type is "end". + VERIFY(token.type == Token::Type::End); + + // 6. Return token. + return token; +} + +// https://urlpattern.spec.whatwg.org/#is-a-non-special-pattern-char +bool ConstructorStringParser::is_a_non_special_pattern_char(u32 index, char value) const +{ + // 1. Let token be the result of running get a safe token given parser and index. + auto const& token = get_a_safe_token(index); + + // 2. If token’s value is not value, then return false. + if (token.value.is_empty() || token.value.bytes().first() != value) + return false; + + // 3. If any of the following are true: + // * token’s type is "char"; + // * token’s type is "escaped-char"; or + // * token’s type is "invalid-char", + // then return true. + if (token.type == Token::Type::Char + || token.type == Token::Type::EscapedChar + || token.type == Token::Type::InvalidChar) { + return true; + } + + // 4. Return false. + return false; +} + +// https://urlpattern.spec.whatwg.org/#rewind +void ConstructorStringParser::rewind() +{ + // 1. Set parser’s token index to parser’s component start. + m_token_index = m_component_start; + + // 2. Set parser’s token increment to 0. + m_token_increment = 0; +} + +// https://urlpattern.spec.whatwg.org/#rewind-and-set-state +void ConstructorStringParser::rewind_and_set_state(State state) +{ + // 1. Run rewind given parser. + rewind(); + + // 2. Set parser’s state to state. 
+ m_state = state; +} + +} diff --git a/Libraries/LibURL/Pattern/ConstructorStringParser.h b/Libraries/LibURL/Pattern/ConstructorStringParser.h new file mode 100644 index 00000000000..baac22dd125 --- /dev/null +++ b/Libraries/LibURL/Pattern/ConstructorStringParser.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2025, Shannon Booth + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include + +namespace URL::Pattern { + +// https://urlpattern.spec.whatwg.org/#constructor-string-parser +class ConstructorStringParser { +public: + static PatternErrorOr parse(Utf8View const& input); + +private: + ConstructorStringParser(Utf8View const& input, Vector token_list); + + // https://urlpattern.spec.whatwg.org/#constructor-string-parser-state + enum class State { + Initial, + Protocol, + Authority, + Username, + Password, + Hostname, + Port, + Pathname, + Search, + Hash, + Done, + }; + StringView state_to_string() const; + + void rewind(); + void rewind_and_set_state(State); + + bool next_is_authority_slashes() const; + + bool is_an_identity_terminator() const; + bool is_a_port_prefix() const; + bool is_a_pathname_start() const; + bool is_a_password_prefix() const; + bool is_a_search_prefix() const; + bool is_a_hash_prefix() const; + bool is_a_protocol_suffix() const; + bool is_an_ipv6_open() const; + bool is_an_ipv6_close() const; + bool is_a_group_open() const; + bool is_a_group_close() const; + + Token const& get_a_safe_token(u32 index) const; + bool is_a_non_special_pattern_char(u32 index, char value) const; + void change_state(State, u32 skip); + String make_a_component_string() const; + PatternErrorOr compute_protocol_matches_a_special_scheme_flag(); + + Optional const& result_for_active_state() const; + void set_result_for_active_state(Optional value); + + // https://urlpattern.spec.whatwg.org/#constructor-string-parser-input + // A constructor string parser has an associated input, a string, which must be set upon creation. 
+ Utf8View m_input; + + // https://urlpattern.spec.whatwg.org/#constructor-string-parser-token-list + // A constructor string parser has an associated token list, a token list, which must be set upon creation. + Vector m_token_list; + + // https://urlpattern.spec.whatwg.org/#constructor-string-parser-result + // A constructor string parser has an associated result, a URLPatternInit, initially set to a new URLPatternInit. + Init m_result; + + // https://urlpattern.spec.whatwg.org/#constructor-string-parser-component-start + // A constructor string parser has an associated component start, a number, initially set to 0. + u32 m_component_start { 0 }; + + // https://urlpattern.spec.whatwg.org/#constructor-string-parser-token-index + // A constructor string parser has an associated token index, a number, initially set to 0. + u32 m_token_index { 0 }; + + // https://urlpattern.spec.whatwg.org/#constructor-string-parser-token-increment + // A constructor string parser has an associated token increment, a number, initially set to 1. + u32 m_token_increment { 1 }; + + // https://urlpattern.spec.whatwg.org/#constructor-string-parser-group-depth + // A constructor string parser has an associated group depth, a number, initially set to 0. + u32 m_group_depth { 0 }; + + // https://urlpattern.spec.whatwg.org/#constructor-string-parser-hostname-ipv6-bracket-depth + // A constructor string parser has an associated hostname IPv6 bracket depth, a number, initially set to 0. + u32 m_hostname_ipv6_bracket_depth { 0 }; + + // https://urlpattern.spec.whatwg.org/#constructor-string-parser-protocol-matches-a-special-scheme-flag + // A constructor string parser has an associated protocol matches a special scheme flag, a boolean, initially set to false. + bool m_protocol_matches_a_special_scheme { false }; + + // https://urlpattern.spec.whatwg.org/#constructor-string-parser-state + // A constructor string parser has an associated state, a string, initially set to "init". 
+    State m_state { State::Initial };
+};
+
+}
diff --git a/Tests/LibURL/CMakeLists.txt b/Tests/LibURL/CMakeLists.txt
index 944d65cdc84..48069b1cee3 100644
--- a/Tests/LibURL/CMakeLists.txt
+++ b/Tests/LibURL/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(URL_TEST_SOURCES
     TestURL.cpp
+    TestURLPatternConstructorStringParser.cpp
 )
 
 foreach(source IN LISTS URL_TEST_SOURCES)
diff --git a/Tests/LibURL/TestURLPatternConstructorStringParser.cpp b/Tests/LibURL/TestURLPatternConstructorStringParser.cpp
new file mode 100644
index 00000000000..e3b8806b677
--- /dev/null
+++ b/Tests/LibURL/TestURLPatternConstructorStringParser.cpp
@@ -0,0 +1,174 @@
+/*
+ * Copyright (c) 2025, Shannon Booth
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include
+
+#include
+
+TEST_CASE(basic_http_url_no_pattern_or_path) // Absolute input made of a scheme and a host only.
+{
+    auto input = "http://www.serenityos.org"_string;
+    auto result = MUST(URL::Pattern::ConstructorStringParser::parse(input.code_points()));
+    EXPECT_EQ(result.protocol, "http"sv);
+    EXPECT_EQ(result.username, OptionalNone {});
+    EXPECT_EQ(result.password, OptionalNone {});
+    EXPECT_EQ(result.hostname, "www.serenityos.org"sv);
+    EXPECT_EQ(result.port, ""sv); // Present but empty, unlike the unset components. NOTE(review): presumably the spec's hostname-without-port rule; confirm against parse().
+    EXPECT_EQ(result.pathname, OptionalNone {});
+    EXPECT_EQ(result.search, OptionalNone {});
+    EXPECT_EQ(result.hash, OptionalNone {});
+    EXPECT_EQ(result.base_url, OptionalNone {});
+}
+
+TEST_CASE(pathname_with_regexp) // Input with no scheme: only the pathname is expected to be set.
+{
+    auto input = "/books/(\\d+)"_string;
+    auto result = MUST(URL::Pattern::ConstructorStringParser::parse(input.code_points()));
+    EXPECT_EQ(result.protocol, OptionalNone {});
+    EXPECT_EQ(result.username, OptionalNone {});
+    EXPECT_EQ(result.password, OptionalNone {});
+    EXPECT_EQ(result.hostname, OptionalNone {});
+    EXPECT_EQ(result.port, OptionalNone {}); // No hostname here, and the port stays unset too.
+    EXPECT_EQ(result.pathname, "/books/(\\d+)"sv); // The parenthesised group is expected back verbatim.
+    EXPECT_EQ(result.search, OptionalNone {});
+    EXPECT_EQ(result.hash, OptionalNone {});
+    EXPECT_EQ(result.base_url, OptionalNone {});
+}
+
+TEST_CASE(url_with_pathname_and_regexp) // Absolute URL whose pathname ends in '*'.
+{
+    auto input = "https://example.com/2022/feb/*"_string;
+    auto result = MUST(URL::Pattern::ConstructorStringParser::parse(input.code_points()));
+    EXPECT_EQ(result.protocol, "https"sv);
+    EXPECT_EQ(result.username, OptionalNone {});
+    EXPECT_EQ(result.password, OptionalNone {});
+    EXPECT_EQ(result.hostname, "example.com"sv);
+    EXPECT_EQ(result.port, ""sv);
+    EXPECT_EQ(result.pathname, "/2022/feb/*"sv); // Leading '/' belongs to the pathname, not the hostname.
+    EXPECT_EQ(result.search, OptionalNone {});
+    EXPECT_EQ(result.hash, OptionalNone {});
+    EXPECT_EQ(result.base_url, OptionalNone {});
+}
+
+TEST_CASE(http_url_regexp_in_pathname_and_hostname) // '*' occurs in both the hostname and the pathname.
+{
+    auto input = "https://cdn-*.example.com/*.jpg"_string;
+    auto result = MUST(URL::Pattern::ConstructorStringParser::parse(input.code_points()));
+    EXPECT_EQ(result.protocol, "https"sv);
+    EXPECT_EQ(result.username, OptionalNone {});
+    EXPECT_EQ(result.password, OptionalNone {});
+    EXPECT_EQ(result.hostname, "cdn-*.example.com"sv); // Pattern syntax is returned unmodified in each component.
+    EXPECT_EQ(result.port, ""sv);
+    EXPECT_EQ(result.pathname, "/*.jpg"sv);
+    EXPECT_EQ(result.search, OptionalNone {});
+    EXPECT_EQ(result.hash, OptionalNone {});
+    EXPECT_EQ(result.base_url, OptionalNone {});
+}
+
+TEST_CASE(https_url_with_fragment) // '#' after the pathname introduces the hash component.
+{
+    auto input = "https://example.com/#foo"_string;
+    auto result = MUST(URL::Pattern::ConstructorStringParser::parse(input.code_points()));
+    EXPECT_EQ(result.protocol, "https"sv);
+    EXPECT_EQ(result.username, OptionalNone {});
+    EXPECT_EQ(result.password, OptionalNone {});
+    EXPECT_EQ(result.hostname, "example.com"sv);
+    EXPECT_EQ(result.port, ""sv);
+    EXPECT_EQ(result.pathname, "/"sv);
+    EXPECT_EQ(result.search, ""sv); // Expected as an empty string, not OptionalNone, when a hash is given without a query.
+    EXPECT_EQ(result.hash, "foo"sv); // The '#' itself is not part of the value.
+    EXPECT_EQ(result.base_url, OptionalNone {});
+}
+
+TEST_CASE(http_url_with_query) // The first '?' after the pathname starts the search component.
+{
+    auto input = "https://example.com/?q=*&v=?&hmm={}&umm=()"_string;
+    auto result = MUST(URL::Pattern::ConstructorStringParser::parse(input.code_points()));
+    EXPECT_EQ(result.protocol, "https"sv);
+    EXPECT_EQ(result.username, OptionalNone {});
+    EXPECT_EQ(result.password, OptionalNone {});
+    EXPECT_EQ(result.hostname, "example.com"sv);
+    EXPECT_EQ(result.port, ""sv);
+    EXPECT_EQ(result.pathname, "/"sv);
+    EXPECT_EQ(result.search, "q=*&v=?&hmm={}&umm=()"sv); // The later '?', the braces and the parentheses stay inside the search string.
+    EXPECT_EQ(result.hash, OptionalNone {});
+    EXPECT_EQ(result.base_url, OptionalNone {});
+}
+
+TEST_CASE(matches_on_sub_url) // A '?' directly after a braced group must not be taken as the start of the search component.
+{
+    auto input = "https://{sub.}?example.com/foo"_string;
+    auto result = MUST(URL::Pattern::ConstructorStringParser::parse(input.code_points()));
+    EXPECT_EQ(result.protocol, "https"sv);
+    EXPECT_EQ(result.username, OptionalNone {});
+    EXPECT_EQ(result.password, OptionalNone {});
+    EXPECT_EQ(result.hostname, "{sub.}?example.com"sv); // Group and its trailing '?' remain in the hostname.
+    EXPECT_EQ(result.port, ""sv);
+    EXPECT_EQ(result.pathname, "/foo"sv);
+    EXPECT_EQ(result.search, OptionalNone {}); // No search component is produced.
+    EXPECT_EQ(result.hash, OptionalNone {});
+    EXPECT_EQ(result.base_url, OptionalNone {});
+}
+
+TEST_CASE(ipv6_with_port_number) // Bracketed host with backslash-escaped colons, followed by an explicit port.
+{
+    auto input = "http://[\\:\\:1]:8080"_string;
+    auto result = MUST(URL::Pattern::ConstructorStringParser::parse(input.code_points()));
+    EXPECT_EQ(result.protocol, "http"sv);
+    EXPECT_EQ(result.username, OptionalNone {});
+    EXPECT_EQ(result.password, OptionalNone {});
+    EXPECT_EQ(result.hostname, "[\\:\\:1]"sv); // Brackets and escapes are expected back unchanged.
+    EXPECT_EQ(result.port, "8080"sv); // Only the ':' after the closing bracket separates the port.
+    EXPECT_EQ(result.pathname, OptionalNone {});
+    EXPECT_EQ(result.search, OptionalNone {});
+    EXPECT_EQ(result.hash, OptionalNone {});
+    EXPECT_EQ(result.base_url, OptionalNone {});
+}
+
+TEST_CASE(data_url) // Scheme followed by an escaped ':' and no authority part.
+{
+    auto input = "data\\:foobar"_string;
+    auto result = MUST(URL::Pattern::ConstructorStringParser::parse(input.code_points()));
+    EXPECT_EQ(result.protocol, "data"sv); // Neither the backslash nor the ':' ends up in the protocol.
+    EXPECT_EQ(result.username, OptionalNone {});
+    EXPECT_EQ(result.password, OptionalNone {});
+    EXPECT_EQ(result.hostname, ""sv); // Hostname and port are expected as empty strings rather than unset.
+    EXPECT_EQ(result.port, ""sv);
+    EXPECT_EQ(result.pathname, "foobar"sv); // Remainder after the ':' becomes the pathname, without a leading '/'.
+    EXPECT_EQ(result.search, OptionalNone {});
+    EXPECT_EQ(result.hash, OptionalNone {});
+    EXPECT_EQ(result.base_url, OptionalNone {});
+}
+
+TEST_CASE(non_special_scheme_and_arbitary_hostname) // A scheme other than http(s) followed by a double slash still yields a hostname.
+{
+    auto pattern_string = "foo://bar"_string;
+    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(pattern_string.code_points()));
+    EXPECT_EQ(init.protocol, "foo"sv);
+    EXPECT_EQ(init.username, OptionalNone {});
+    EXPECT_EQ(init.password, OptionalNone {});
+    EXPECT_EQ(init.hostname, "bar"sv);
+    EXPECT_EQ(init.port, ""sv); // Present but empty, while the remaining components are unset.
+    EXPECT_EQ(init.pathname, OptionalNone {});
+    EXPECT_EQ(init.search, OptionalNone {});
+    EXPECT_EQ(init.hash, OptionalNone {});
+    EXPECT_EQ(init.base_url, OptionalNone {});
+}
+
+TEST_CASE(ipv6_with_named_group) // A ':' inside square brackets must not be taken as the port separator.
+{
+    auto pattern_string = "http://[:address]/"_string;
+    auto init = MUST(URL::Pattern::ConstructorStringParser::parse(pattern_string.code_points()));
+    EXPECT_EQ(init.protocol, "http"sv);
+    EXPECT_EQ(init.username, OptionalNone {});
+    EXPECT_EQ(init.password, OptionalNone {});
+    EXPECT_EQ(init.hostname, "[:address]"sv); // Brackets and the named group are expected back verbatim.
+    EXPECT_EQ(init.port, ""sv);
+    EXPECT_EQ(init.pathname, "/"sv);
+    EXPECT_EQ(init.search, OptionalNone {});
+    EXPECT_EQ(init.hash, OptionalNone {});
+    EXPECT_EQ(init.base_url, OptionalNone {});
+}