From c9e6ad562cbcc0b8fd978e8ccce6ad068a62b632 Mon Sep 17 00:00:00 2001
From: Shannon Booth
Date: Tue, 18 Mar 2025 19:34:25 +1300
Subject: [PATCH] LibURL/Pattern: Implement ability to compile a component

This provides the infrastructure for taking a part list from the pattern
parser and generating the actual regexp object which is used for matching
against URLs from the pattern.
---
 Libraries/LibURL/CMakeLists.txt        |   1 +
 Libraries/LibURL/Pattern/Component.cpp | 257 +++++++++++++++++++++++++
 Libraries/LibURL/Pattern/Component.h   |   3 +
 3 files changed, 261 insertions(+)
 create mode 100644 Libraries/LibURL/Pattern/Component.cpp

diff --git a/Libraries/LibURL/CMakeLists.txt b/Libraries/LibURL/CMakeLists.txt
index bb709e6bda5..c1a5e1199c1 100644
--- a/Libraries/LibURL/CMakeLists.txt
+++ b/Libraries/LibURL/CMakeLists.txt
@@ -8,6 +8,7 @@ set(SOURCES
     URL.cpp
     ${PUBLIC_SUFFIX_SOURCES}
     Pattern/Canonicalization.cpp
+    Pattern/Component.cpp
     Pattern/ConstructorStringParser.cpp
     Pattern/Init.cpp
     Pattern/Options.cpp
diff --git a/Libraries/LibURL/Pattern/Component.cpp b/Libraries/LibURL/Pattern/Component.cpp
new file mode 100644
index 00000000000..d6d649e6c7a
--- /dev/null
+++ b/Libraries/LibURL/Pattern/Component.cpp
@@ -0,0 +1,257 @@
+/*
+ * Copyright (c) 2025, Shannon Booth
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include // NOTE(review): the header names are missing from all four #include lines in this copy - restore them from the upstream commit
+#include
+#include
+#include
+
+namespace URL::Pattern {
+
+// https://urlpattern.spec.whatwg.org/#generate-a-regular-expression-and-name-list
+struct RegularExpressionAndNameList { // Return value of the generator below: the regexp source text plus the names collected for it.
+    String regular_expression;
+    Vector name_list; // NOTE(review): the element type (template argument) seems to be missing here - confirm against upstream
+};
+
+static RegularExpressionAndNameList generate_a_regular_expression_and_name_list(Vector const& part_list, Options const& options)
+{ // Emits an anchored regexp source ('^' ... '$') for part_list; every part that is not fixed text adds its name to the name list.
+    // 1. Let result be "^".
+    StringBuilder result;
+    result.append('^');
+
+    // 2. Let name list be a new list.
+    Vector name_list;
+
+    // 3. For each part of part list:
+    for (auto const& part : part_list) {
+        // 1. If part’s type is "fixed-text":
+        if (part.type == Part::Type::FixedText) {
+            // 1. If part’s modifier is "none", then append the result of running escape a regexp string given part’s
+            //    value to the end of result.
+            if (part.modifier == Part::Modifier::None) {
+                result.append(escape_a_regexp_string(part.value));
+            }
+            // 2. Otherwise:
+            else {
+                // 1. Append "(?:" to the end of result.
+                result.append("(?:"sv);
+
+                // 2. Append the result of running escape a regexp string given part’s value to the end of result.
+                result.append(escape_a_regexp_string(part.value));
+
+                // 3. Append ")" to the end of result.
+                result.append(')');
+
+                // 4. Append the result of running convert a modifier to a string given part’s modifier to the end of result.
+                result.append(Part::convert_modifier_to_string(part.modifier));
+            }
+
+            // 3. Continue.
+            continue;
+        }
+
+        // 2. Assert: part’s name is not the empty string.
+        VERIFY(!part.name.is_empty());
+
+        // 3. Append part’s name to name list.
+        name_list.append(part.name);
+
+        // 4. Let regexp value be part’s value.
+        auto regexp_value = part.value; // Local copy; replaced below for the two wildcard part types.
+
+        // 5. If part’s type is "segment-wildcard", then set regexp value to the result of running generate a segment wildcard regexp given options.
+        if (part.type == Part::Type::SegmentWildcard) {
+            regexp_value = generate_a_segment_wildcard_regexp(options);
+        }
+        // 6. Otherwise if part’s type is "full-wildcard", then set regexp value to full wildcard regexp value.
+        else if (part.type == Part::Type::FullWildcard) {
+            regexp_value = MUST(String::from_utf8(full_wildcard_regexp_value));
+        }
+
+        // 7. If part’s prefix is the empty string and part’s suffix is the empty string:
+        if (part.prefix.is_empty() && part.suffix.is_empty()) {
+            // 1. If part’s modifier is "none" or "optional", then:
+            if (part.modifier == Part::Modifier::None || part.modifier == Part::Modifier::Optional) {
+                // 1. Append "(" to the end of result.
+                result.append('(');
+
+                // 2. Append regexp value to the end of result.
+                result.append(regexp_value);
+
+                // 3. Append ")" to the end of result.
+                result.append(')');
+
+                // 4. Append the result of running convert a modifier to a string given part’s modifier to the end of result.
+                result.append(Part::convert_modifier_to_string(part.modifier));
+            }
+            // 2. Otherwise:
+            else {
+                // 1. Append "((?:" to the end of result.
+                result.append("((?:"sv);
+
+                // 2. Append regexp value to the end of result.
+                result.append(regexp_value);
+
+                // 3. Append ")" to the end of result.
+                result.append(')');
+
+                // 4. Append the result of running convert a modifier to a string given part’s modifier to the end of result.
+                result.append(Part::convert_modifier_to_string(part.modifier));
+
+                // 5. Append ")" to the end of result.
+                result.append(')');
+            }
+
+            // 3. Continue.
+            continue;
+        }
+
+        // 8. If part’s modifier is "none" or "optional":
+        if (part.modifier == Part::Modifier::None || part.modifier == Part::Modifier::Optional) {
+            // 1. Append "(?:" to the end of result.
+            result.append("(?:"sv);
+
+            // 2. Append the result of running escape a regexp string given part’s prefix to the end of result.
+            result.append(escape_a_regexp_string(part.prefix));
+
+            // 3. Append "(" to the end of result.
+            result.append('(');
+
+            // 4. Append regexp value to the end of result.
+            result.append(regexp_value);
+
+            // 5. Append ")" to the end of result.
+            result.append(')');
+
+            // 6. Append the result of running escape a regexp string given part’s suffix to the end of result.
+            result.append(escape_a_regexp_string(part.suffix));
+
+            // 7. Append ")" to the end of result.
+            result.append(')');
+
+            // 8. Append the result of running convert a modifier to a string given part’s modifier to the end of result.
+            result.append(Part::convert_modifier_to_string(part.modifier));
+
+            // 9. Continue.
+            continue;
+        }
+
+        // 9. Assert: part’s modifier is "zero-or-more" or "one-or-more".
+        VERIFY(part.modifier == Part::Modifier::ZeroOrMore || part.modifier == Part::Modifier::OneOrMore);
+
+        // 10. Assert: part’s prefix is not the empty string or part’s suffix is not the empty string.
+        VERIFY(!part.prefix.is_empty() || !part.suffix.is_empty());
+
+        // 11. Append "(?:" to the end of result.
+        result.append("(?:"sv);
+
+        // 12. Append the result of running escape a regexp string given part’s prefix to the end of result.
+        result.append(escape_a_regexp_string(part.prefix));
+
+        // 13. Append "((?:" to the end of result.
+        result.append("((?:"sv);
+
+        // 14. Append regexp value to the end of result.
+        result.append(regexp_value);
+
+        // 15. Append ")(?:" to the end of result.
+        result.append(")(?:"sv);
+
+        // 16. Append the result of running escape a regexp string given part’s suffix to the end of result.
+        result.append(escape_a_regexp_string(part.suffix));
+
+        // 17. Append the result of running escape a regexp string given part’s prefix to the end of result.
+        result.append(escape_a_regexp_string(part.prefix));
+
+        // 18. Append "(?:" to the end of result.
+        result.append("(?:"sv);
+
+        // 19. Append regexp value to the end of result.
+        result.append(regexp_value);
+
+        // 20. Append "))*)" to the end of result.
+        result.append("))*)"sv);
+
+        // 21. Append the result of running escape a regexp string given part’s suffix to the end of result.
+        result.append(escape_a_regexp_string(part.suffix));
+
+        // 22. Append ")" to the end of result.
+        result.append(')');
+
+        // 23. If part’s modifier is "zero-or-more" then append "?" to the end of result.
+        if (part.modifier == Part::Modifier::ZeroOrMore)
+            result.append('?');
+    }
+
+    // 4. Append "$" to the end of result.
+    result.append('$');
+
+    // 5. Return (result, name list).
+    return { result.to_string_without_validation(), move(name_list) };
+}
+
+// https://urlpattern.spec.whatwg.org/#compile-a-component
+PatternErrorOr Component::compile(Utf8View const& input, PatternParser::EncodingCallback encoding_callback, Options const& options)
+{ // Errors from pattern parsing (propagated by TRY) and from regexp compilation are returned to the caller.
+    // 1. Let part list be the result of running parse a pattern string given input, options, and encoding callback.
+    auto part_list = TRY(PatternParser::parse(input, options, move(encoding_callback)));
+
+    // 2. Let (regular expression string, name list) be the result of running generate a regular expression and name
+    //    list given part list and options.
+    auto [regular_expression_string, name_list] = generate_a_regular_expression_and_name_list(part_list, options);
+
+    // 3. Let flags be an empty string.
+    // NOTE: These flags match the flags for the empty string of the LibJS RegExp implementation.
+    auto flags = regex::RegexOptions {
+        (regex::ECMAScriptFlags)regex::AllFlags::SingleMatch
+        | (regex::ECMAScriptFlags)regex::AllFlags::Global
+        | (regex::ECMAScriptFlags)regex::AllFlags::SkipTrimEmptyMatches
+        | regex::ECMAScriptFlags::BrowserExtended
+    };
+
+    // 4. If options’s ignore case is true then set flags to "vi".
+    if (options.ignore_case) {
+        flags |= regex::ECMAScriptFlags::UnicodeSets; // "v"
+        flags |= regex::ECMAScriptFlags::Insensitive; // "i"
+    }
+    // 5. Otherwise set flags to "v"
+    else {
+        flags |= regex::ECMAScriptFlags::UnicodeSets; // "v"
+    }
+
+    // 6. Let regular expression be RegExpCreate(regular expression string, flags). If this throws an exception, catch
+    //    it, and throw a TypeError.
+    auto regex = make>(regular_expression_string.to_byte_string(), flags); // NOTE(review): the template argument of make appears to be missing in this copy - confirm against upstream
+    if (regex->parser_result.error != regex::Error::NoError)
+        return ErrorInfo { MUST(String::formatted("RegExp compile error: {}", regex->error_string())) }; // Carries the regex parser's error string.
+
+    // 7. Let pattern string be the result of running generate a pattern string given part list and options.
+    auto pattern_string = generate_a_pattern_string(part_list, options);
+
+    // 8. Let has regexp groups be false.
+    bool has_regexp_groups = false;
+
+    // 9. For each part of part list:
+    for (auto const& part : part_list) {
+        // 1. If part’s type is "regexp", then set has regexp groups to true.
+        if (part.type == Part::Type::Regexp) {
+            has_regexp_groups = true;
+            break; // A single regexp part settles the answer; no need to look at the rest.
+        }
+    }
+
+    // 10. Return a new component whose pattern string is pattern string, regular expression is regular expression,
+    //     group name list is name list, and has regexp groups is has regexp groups.
+    return Component {
+        .pattern_string = move(pattern_string),
+        .regular_expression = move(regex),
+        .group_name_list = move(name_list),
+        .has_regexp_groups = has_regexp_groups,
+    };
+}
+
+}
diff --git a/Libraries/LibURL/Pattern/Component.h b/Libraries/LibURL/Pattern/Component.h
index eea59a101cd..5d7ec4dd017 100644
--- a/Libraries/LibURL/Pattern/Component.h
+++ b/Libraries/LibURL/Pattern/Component.h
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include
 
 namespace URL::Pattern {
 
@@ -29,6 +30,8 @@ struct Component {
     // https://urlpattern.spec.whatwg.org/#component-has-regexp-groups
     // has regexp groups, a boolean
     bool has_regexp_groups {};
+
+    static PatternErrorOr compile(Utf8View const& input, PatternParser::EncodingCallback, Options const&); // https://urlpattern.spec.whatwg.org/#compile-a-component
 };
 
 }