ladybird/Libraries/LibURL/Pattern/Tokenizer.cpp
Shannon Booth f8f21319f9 LibURL/Pattern: Implement the URL Pattern Tokenizer
The tokenizer is used for both pattern string and constructor string
parsing of URL Patterns.
2025-03-15 07:39:03 -04:00

440 lines
18 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
* Copyright (c) 2025, Shannon Booth <shannon@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Debug.h>
#include <LibURL/Pattern/Tokenizer.h>
#include <LibUnicode/CharacterTypes.h>
namespace URL::Pattern {
// Returns the human readable name of a token type, as used by Token::to_string().
StringView Token::type_to_string(Token::Type type)
{
    // NOTE: The switch intentionally has no default case, so every enumerator
    //       handled here returns directly and anything else hits the VERIFY below.
    switch (type) {
    case Type::Open: return "Open"sv;
    case Type::Close: return "Close"sv;
    case Type::Regexp: return "Regexp"sv;
    case Type::Name: return "Name"sv;
    case Type::Char: return "Char"sv;
    case Type::EscapedChar: return "EscapedChar"sv;
    case Type::OtherModifier: return "OtherModifier"sv;
    case Type::Asterisk: return "Asterisk"sv;
    case Type::End: return "End"sv;
    case Type::InvalidChar: return "InvalidChar"sv;
    }

    VERIFY_NOT_REACHED();
}
// Formats this token as "<type name>, index: <index>, value: '<value>'" for debug output.
String Token::to_string() const
{
    auto const type_name = type_to_string(type);
    auto formatted = String::formatted("{}, index: {}, value: '{}'", type_name, index, value);
    return MUST(move(formatted));
}
Tokenizer::Tokenizer(Utf8View const& input, Policy policy)
: m_input(input)
, m_policy(policy)
{
}
// https://urlpattern.spec.whatwg.org/#tokenize
//
// Splits `input` into a list of tokens, following the spec algorithm step by step.
//
// `input` must be valid UTF-8 (this is VERIFY'd). `policy` selects what happens on a
// tokenizing error: with Policy::Strict the error is returned to the caller, with
// Policy::Lenient an "invalid-char" token is emitted and tokenizing carries on.
//
// On success the returned list always ends with a token of type End.
PatternErrorOr<Vector<Token>> Tokenizer::tokenize(Utf8View const& input, Tokenizer::Policy policy)
{
    dbgln_if(URL_PATTERN_DEBUG, "URLPattern tokenizing input: '{}'", input.as_string());

    VERIFY(input.validate());

    // 1. Let tokenizer be a new tokenizer.
    // 2. Set tokenizer's input to input.
    // 3. Set tokenizer's policy to policy.
    Tokenizer tokenizer { input, policy };

    // 4. While tokenizer's index is less than tokenizer's input's code point length:
    while (tokenizer.m_index < tokenizer.m_input.length()) {
        // 1. Run seek and get the next code point given tokenizer and tokenizer's index.
        tokenizer.seek_and_get_the_next_code_point(tokenizer.m_index);

        // 2. If tokenizer's code point is U+002A (*):
        if (tokenizer.m_code_point == '*') {
            // 1. Run add a token with default position and length given tokenizer and "asterisk".
            tokenizer.add_a_token_with_default_position_and_length(Token::Type::Asterisk);

            // 2. Continue.
            continue;
        }

        // 3. If tokenizer's code point is U+002B (+) or U+003F (?):
        if (tokenizer.m_code_point == '+' || tokenizer.m_code_point == '?') {
            // 1. Run add a token with default position and length given tokenizer and "other-modifier".
            tokenizer.add_a_token_with_default_position_and_length(Token::Type::OtherModifier);

            // 2. Continue.
            continue;
        }

        // 4. If tokenizer's code point is U+005C (\):
        if (tokenizer.m_code_point == '\\') {
            // 1. If tokenizer's index is equal to tokenizer's input's code point length - 1:
            if (tokenizer.m_index == tokenizer.m_input.length() - 1) {
                // 1. Run process a tokenizing error given tokenizer, tokenizer's next index, and tokenizer's index.
                TRY(tokenizer.process_a_tokenizing_error(tokenizer.m_next_index, tokenizer.m_index));

                // 2. Continue.
                continue;
            }

            // 2. Let escaped index be tokenizer's next index.
            auto escaped_index = tokenizer.m_next_index;

            // 3. Run get the next code point given tokenizer.
            tokenizer.get_the_next_code_point();

            // 4. Run add a token with default length given tokenizer, "escaped-char", tokenizer's next index, and escaped index.
            tokenizer.add_a_token_with_default_length(Token::Type::EscapedChar, tokenizer.m_next_index, escaped_index);

            // 5. Continue.
            continue;
        }

        // 5. If tokenizer's code point is U+007B ({):
        if (tokenizer.m_code_point == '{') {
            // 1. Run add a token with default position and length given tokenizer and "open".
            tokenizer.add_a_token_with_default_position_and_length(Token::Type::Open);

            // 2. Continue.
            continue;
        }

        // 6. If tokenizer's code point is U+007D (}):
        if (tokenizer.m_code_point == '}') {
            // 1. Run add a token with default position and length given tokenizer and "close".
            tokenizer.add_a_token_with_default_position_and_length(Token::Type::Close);

            // 2. Continue.
            continue;
        }

        // 7. If tokenizer's code point is U+003A (:):
        if (tokenizer.m_code_point == ':') {
            // 1. Let name position be tokenizer's next index.
            auto name_position = tokenizer.m_next_index;

            // 2. Let name start be name position.
            auto name_start = name_position;

            // 3. While name position is less than tokenizer's input's code point length:
            while (name_position < tokenizer.m_input.length()) {
                // 1. Run seek and get the next code point given tokenizer and name position.
                tokenizer.seek_and_get_the_next_code_point(name_position);

                // 2. Let first code point be true if name position equals name start and false otherwise.
                bool first_code_point = name_position == name_start;

                // 3. Let valid code point be the result of running is a valid name code point given tokenizer's code point and first code point.
                bool valid_code_point = is_a_valid_name_code_point(tokenizer.m_code_point, first_code_point);

                // 4. If valid code point is false break.
                if (!valid_code_point)
                    break;

                // 5. Set name position to tokenizer's next index.
                name_position = tokenizer.m_next_index;
            }

            // 4. If name position is less than or equal to name start:
            if (name_position <= name_start) {
                // 1. Run process a tokenizing error given tokenizer, name start, and tokenizer's index.
                TRY(tokenizer.process_a_tokenizing_error(name_start, tokenizer.m_index));

                // 2. Continue.
                continue;
            }

            // 5. Run add a token with default length given tokenizer, "name", name position, and name start.
            tokenizer.add_a_token_with_default_length(Token::Type::Name, name_position, name_start);

            // 6. Continue.
            continue;
        }

        // 8. If tokenizer's code point is U+0028 (():
        if (tokenizer.m_code_point == '(') {
            // 1. Let depth be 1.
            u32 depth = 1;

            // 2. Let regexp position be tokenizer's next index.
            auto regexp_position = tokenizer.m_next_index;

            // 3. Let regexp start be regexp position.
            auto regexp_start = regexp_position;

            // 4. Let error be false.
            bool error = false;

            // 5. While regexp position is less than tokenizer's input's code point length:
            while (regexp_position < tokenizer.m_input.length()) {
                // 1. Run seek and get the next code point given tokenizer and regexp position.
                tokenizer.seek_and_get_the_next_code_point(regexp_position);

                // 2. If the result of running is ASCII given tokenizer's code point is false:
                if (!is_ascii(tokenizer.m_code_point)) {
                    // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                    TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                    // 2. Set error to true.
                    error = true;

                    // 3. Break.
                    break;
                }

                // 3. If regexp position equals regexp start and tokenizer's code point is U+003F (?):
                if (regexp_position == regexp_start && tokenizer.m_code_point == '?') {
                    // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                    TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                    // 2. Set error to true.
                    error = true;

                    // 3. Break.
                    break;
                }

                // 4. If tokenizer's code point is U+005C (\):
                if (tokenizer.m_code_point == '\\') {
                    // 1. If regexp position equals tokenizer's input's code point length - 1:
                    if (regexp_position == tokenizer.m_input.length() - 1) {
                        // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                        TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                        // 2. Set error to true.
                        error = true;

                        // 3. Break
                        break;
                    }

                    // 2. Run get the next code point given tokenizer.
                    tokenizer.get_the_next_code_point();

                    // 3. If the result of running is ASCII given tokenizer's code point is false:
                    if (!is_ascii(tokenizer.m_code_point)) {
                        // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                        TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                        // 2. Set error to true.
                        error = true;

                        // 3. Break.
                        break;
                    }

                    // 4. Set regexp position to tokenizer's next index.
                    regexp_position = tokenizer.m_next_index;

                    // 5. Continue.
                    continue;
                }

                // 5. If tokenizer's code point is U+0029 ()):
                if (tokenizer.m_code_point == ')') {
                    // 1. Decrement depth by 1.
                    --depth;

                    // 2. If depth is 0:
                    if (depth == 0) {
                        // 1. Set regexp position to tokenizer's next index.
                        regexp_position = tokenizer.m_next_index;

                        // 2. Break.
                        break;
                    }
                }
                // 6. Otherwise if tokenizer's code point is U+0028 (():
                else if (tokenizer.m_code_point == '(') {
                    // 1. Increment depth by 1.
                    ++depth;

                    // 2. If regexp position equals tokenizer's input's code point length - 1:
                    if (regexp_position == tokenizer.m_input.length() - 1) {
                        // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                        TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                        // 2. Set error to true.
                        error = true;

                        // 3. Break
                        break;
                    }

                    // 3. Let temporary position be tokenizer's next index.
                    auto temporary_position = tokenizer.m_next_index;

                    // 4. Run get the next code point given tokenizer.
                    tokenizer.get_the_next_code_point();

                    // 5. If tokenizer's code point is not U+003F (?):
                    if (tokenizer.m_code_point != '?') {
                        // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                        TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                        // 2. Set error to true.
                        error = true;

                        // 3. Break.
                        break;
                    }

                    // 6. Set tokenizer's next index to temporary position.
                    tokenizer.m_next_index = temporary_position;
                }

                // 7. Set regexp position to tokenizer's next index.
                regexp_position = tokenizer.m_next_index;
            }

            // 6. If error is true continue.
            if (error)
                continue;

            // 7. If depth is not zero:
            if (depth != 0) {
                // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                // 2. Continue.
                continue;
            }

            // 8. Let regexp length be regexp position - regexp start - 1.
            auto regexp_length = regexp_position - regexp_start - 1;

            // 9. If regexp length is zero:
            if (regexp_length == 0) {
                // 1. Run process a tokenizing error given tokenizer, regexp start, and tokenizer's index.
                TRY(tokenizer.process_a_tokenizing_error(regexp_start, tokenizer.m_index));

                // 2. Continue.
                continue;
            }

            // 10. Run add a token given tokenizer, "regexp", regexp position, regexp start, and regexp length.
            tokenizer.add_a_token(Token::Type::Regexp, regexp_position, regexp_start, regexp_length);

            // 11. Continue.
            continue;
        }

        // 9. Run add a token with default position and length given tokenizer and "char".
        tokenizer.add_a_token_with_default_position_and_length(Token::Type::Char);
    }

    // 5. Run add a token with default length given tokenizer, "end", tokenizer's index, and tokenizer's index.
    tokenizer.add_a_token_with_default_length(Token::Type::End, tokenizer.m_index, tokenizer.m_index);

    // 6. Return tokenizer's token list.
    if constexpr (URL_PATTERN_DEBUG) {
        for (auto const& token : tokenizer.m_token_list)
            dbgln("{}", token.to_string());
    }

    // NOTE: The token list is a member of a local object, so it is not implicitly moved when
    //       returned. Move it explicitly to avoid copying every token; the tokenizer is
    //       destroyed right after this statement anyway.
    return move(tokenizer.m_token_list);
}
// https://urlpattern.spec.whatwg.org/#get-the-next-code-point
// Reads the code point at the tokenizer's next index and then steps the next index past it.
void Tokenizer::get_the_next_code_point()
{
    // 1. Set tokenizer's code point to the Unicode code point in tokenizer's input at the position indicated by
    //    tokenizer's next index.
    auto const single_code_point_view = m_input.unicode_substring_view(m_next_index, 1);
    m_code_point = *single_code_point_view.begin();

    // 2. Increment tokenizer's next index by 1.
    m_next_index += 1;
}
// https://urlpattern.spec.whatwg.org/#seek-and-get-the-next-code-point
// Repositions the tokenizer at `index` (counted in code points) and reads the code point found there.
void Tokenizer::seek_and_get_the_next_code_point(u32 index)
{
    // 1. Set tokenizer's next index to index.
    // 2. Run get the next code point given tokenizer.
    m_next_index = index;
    get_the_next_code_point();
}
// https://urlpattern.spec.whatwg.org/#add-a-token
// Appends a token of the given type whose value is the `value_length` code points of the input
// starting at `value_position`, then moves the tokenizer's index to `next_position`.
void Tokenizer::add_a_token(Token::Type type, u32 next_position, u32 value_position, u32 value_length)
{
    // 4. (Prepared up front) The token's value is the code point substring from value position with length
    //    value length within tokenizer's input.
    auto const value_view = m_input.unicode_substring_view(value_position, value_length);

    // 1. Let token be a new token.
    // 2. Set token's type to type.
    // 3. Set token's index to tokenizer's index.
    // 4. Set token's value to the substring computed above.
    Token new_token;
    new_token.type = type;
    new_token.index = m_index;
    new_token.value = MUST(String::from_utf8(value_view.as_string()));

    // 5. Append token to the back of tokenizer's token list.
    m_token_list.append(move(new_token));

    // 6. Set tokenizer's index to next position.
    m_index = next_position;
}
// https://urlpattern.spec.whatwg.org/#add-a-token-with-default-length
// Adds a token whose value spans from `value_position` up to (but not including) `next_position`.
void Tokenizer::add_a_token_with_default_length(Token::Type type, u32 next_position, u32 value_position)
{
    // 1. Let computed length be next position - value position.
    // 2. Run add a token given tokenizer, type, next position, value position, and computed length.
    add_a_token(type, next_position, value_position, next_position - value_position);
}
// https://urlpattern.spec.whatwg.org/#add-a-token-with-default-position-and-length
// Adds a token covering the input between the tokenizer's current index and its next index.
void Tokenizer::add_a_token_with_default_position_and_length(Token::Type type)
{
    // 1. Run add a token with default length given tokenizer, type, tokenizer's next index, and tokenizer's index.
    auto const next_position = m_next_index;
    auto const value_position = m_index;
    add_a_token_with_default_length(type, next_position, value_position);
}
// https://urlpattern.spec.whatwg.org/#process-a-tokenizing-error
// Handles a tokenizing error according to the tokenizer's policy: a strict tokenizer returns an
// error, a lenient one records an "invalid-char" token and lets tokenizing continue.
PatternErrorOr<void> Tokenizer::process_a_tokenizing_error(u32 next_position, u32 value_position)
{
    if (m_policy != Policy::Strict) {
        // 2. Assert: tokenizer's policy is "lenient".
        VERIFY(m_policy == Policy::Lenient);

        // 3. Run add a token with default length given tokenizer, "invalid-char", next position, and value position.
        add_a_token_with_default_length(Token::Type::InvalidChar, next_position, value_position);
        return {};
    }

    // 1. If tokenizer's policy is "strict", then throw a TypeError.
    return ErrorInfo { "Error processing a token"_string }; // FIXME: Improve this error!
}
// https://urlpattern.spec.whatwg.org/#is-a-valid-name-code-point
// Returns whether `code_point` may appear in a group name; `first` selects the stricter rules
// that apply to the first code point of the name.
bool Tokenizer::is_a_valid_name_code_point(u32 code_point, bool first)
{
    // '$' is accepted in both the leading and the trailing positions.
    if (code_point == '$')
        return true;

    // 1. If first is true return the result of checking if code point is contained in the IdentifierStart set of
    //    code points.
    if (first) {
        if (code_point == '_')
            return true;
        return Unicode::code_point_has_identifier_start_property(code_point);
    }

    // 2. Otherwise return the result of checking if code point is contained in the IdentifierPart set of code points.
    return Unicode::code_point_has_identifier_continue_property(code_point);
}
}