AK+LibURL+LibWeb: Use simdutf to validate ASCII strings

simdutf provides a vectorized ASCII validator, so let's use that instead of looping over strings manually.
2025-04-21 12:05:15 +00:00 · 2025-04-06 08:39:05 -04:00 · 2025-04-06 08:39:05 -04:00 · ed3a677f08
commit ed3a677f08
parent 212095e1c2
10 changed files with 32 additions and 11 deletions
--- a/AK/String.h
+++ b/AK/String.h
@ -112,6 +112,7 @@ public:

    [[nodiscard]] String to_ascii_lowercase() const;
    [[nodiscard]] String to_ascii_uppercase() const;
+    // Whether this string consists solely of ASCII bytes; forwards to StringView::is_ascii().
+    [[nodiscard]] bool is_ascii() const
+    {
+        return bytes_as_string_view().is_ascii();
+    }

    // Compare this String against another string with caseless matching. Using this method requires linking LibUnicode into your application.
    [[nodiscard]] bool equals_ignoring_case(String const&) const;
--- a/AK/StringView.cpp
+++ b/AK/StringView.cpp
@ -15,6 +15,8 @@
 #include <AK/StringView.h>
 #include <AK/Vector.h>

+#include <simdutf.h>
+
 namespace AK {

 StringView::StringView(String const& string)
@ -195,6 +197,13 @@ bool StringView::equals_ignoring_ascii_case(StringView other) const
    return StringUtils::equals_ignoring_ascii_case(*this, other);
 }

+bool StringView::is_ascii() const
+{
+    // An empty view is trivially ASCII, so the validator is only consulted for non-empty views.
+    return is_empty() || simdutf::validate_ascii(characters_without_null_termination(), length());
+}
+
 ByteString StringView::to_lowercase_string() const
 {
    return StringImpl::create_lowercased(characters_without_null_termination(), length()).release_nonnull();
--- a/AK/StringView.h
+++ b/AK/StringView.h
@ -100,6 +100,7 @@ public:
    [[nodiscard]] bool contains(u32) const;
    [[nodiscard]] bool contains(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive) const;
    [[nodiscard]] bool equals_ignoring_ascii_case(StringView) const;
+    [[nodiscard]] bool is_ascii() const;

    [[nodiscard]] StringView trim(StringView characters, TrimMode mode = TrimMode::Both) const { return StringUtils::trim(*this, characters, mode); }
    [[nodiscard]] StringView trim_whitespace(TrimMode mode = TrimMode::Both) const { return StringUtils::trim_whitespace(*this, mode); }
--- a/Libraries/LibURL/Host.cpp
+++ b/Libraries/LibURL/Host.cpp
@ -196,7 +196,7 @@ Optional<String> Host::public_suffix() const
    auto public_suffix = get_public_suffix(host_string.bytes_as_string_view()).value_or("*"_string);

    // 4. Assert: publicSuffix is an ASCII string that does not end with ".".
-    VERIFY(all_of(public_suffix.code_points(), is_ascii));
+    VERIFY(public_suffix.is_ascii());
    VERIFY(!public_suffix.ends_with('.'));

    // 5. Return publicSuffix and trailingDot concatenated.
@ -223,7 +223,7 @@ Optional<String> Host::registrable_domain() const
    auto registrable_domain = get_registrable_domain(host_string).value_or("*"_string);

    // 4. Assert: registrableDomain is an ASCII string that does not end with ".".
-    VERIFY(all_of(registrable_domain.code_points(), is_ascii));
+    VERIFY(registrable_domain.is_ascii());
    VERIFY(!registrable_domain.ends_with('.'));

    // 5. Return registrableDomain and trailingDot concatenated.
--- a/Libraries/LibURL/Parser.cpp
+++ b/Libraries/LibURL/Parser.cpp
@ -514,7 +514,7 @@ static ErrorOr<String> domain_to_ascii(StringView domain, bool be_strict)
    // OPTIMIZATION: If beStrict is false, domain is an ASCII string, and strictly splitting domain on U+002E (.)
    //               does not produce any item that starts with an ASCII case-insensitive match for "xn--", this
    //               step is equivalent to ASCII lowercasing domain.
-    if (!be_strict && all_of(domain, is_ascii)) {
+    if (!be_strict && domain.is_ascii()) {
        // 3. If result is the empty string, domain-to-ASCII validation error, return failure.
        if (domain.is_empty())
            return Error::from_string_literal("Empty domain");
--- a/Libraries/LibURL/Pattern/String.cpp
+++ b/Libraries/LibURL/Pattern/String.cpp
@ -14,7 +14,7 @@ namespace URL::Pattern {
 String escape_a_pattern_string(String const& input)
 {
    // 1. Assert: input is an ASCII string.
-    VERIFY(all_of(input.code_points(), is_ascii));
+    VERIFY(input.is_ascii());

    // 2. Let result be the empty string.
    StringBuilder result;
@ -51,7 +51,7 @@ String escape_a_pattern_string(String const& input)
 String escape_a_regexp_string(String const& input)
 {
    // 1. Assert: input is an ASCII string.
-    VERIFY(all_of(input.code_points(), is_ascii));
+    VERIFY(input.is_ascii());

    // 2. Let result be the empty string.
    StringBuilder builder;
--- a/Libraries/LibWeb/ContentSecurityPolicy/Policy.cpp
+++ b/Libraries/LibWeb/ContentSecurityPolicy/Policy.cpp
@ -47,7 +47,7 @@ GC::Ref<Policy> Policy::parse_a_serialized_csp(JS::Realm& realm, Variant<ByteBuf
        auto stripped_token_view = stripped_token.bytes_as_string_view();

        // 2. If token is an empty string, or if token is not an ASCII string, continue.
-        if (stripped_token.is_empty() || !all_of(stripped_token_view, is_ascii))
+        if (stripped_token.is_empty() || !stripped_token_view.is_ascii())
            continue;

        // 3. Let directive name be the result of collecting a sequence of code points from token which are not
--- a/Libraries/LibWeb/Fetch/Body.cpp
+++ b/Libraries/LibWeb/Fetch/Body.cpp
@ -436,7 +436,7 @@ MultipartParsingErrorOr<Vector<XHR::FormDataEntry>> parse_multipart_form_data(JS
                header.content_type = "text/plain"_string;

            // 2. If contentType is not an ASCII string, set contentType to the empty string.
-            if (!all_of(header.content_type->code_points(), is_ascii)) {
+            if (!header.content_type->is_ascii()) {
                header.content_type = ""_string;
            }

--- a/Libraries/LibWebView/CookieJar.cpp
+++ b/Libraries/LibWebView/CookieJar.cpp
@ -333,10 +333,8 @@ void CookieJar::store_cookie(Web::Cookie::ParsedCookie const& parsed_cookie, con

    // 8. If the domain-attribute contains a character that is not in the range of [USASCII] characters, abort these
    //    steps and ignore the cookie entirely.
-    for (auto code_point : domain_attribute.code_points()) {
-        if (!is_ascii(code_point))
-            return;
-    }
+    if (!domain_attribute.is_ascii())
+        return;

    // 9. If the user agent is configured to reject "public suffixes" and the domain-attribute is a public suffix:
    if (URL::is_public_suffix(domain_attribute)) {
--- a/Tests/AK/TestString.cpp
+++ b/Tests/AK/TestString.cpp
@ -1516,3 +1516,15 @@ TEST_CASE(to_ascii_uppercase)
    auto uppercased = long_string.to_ascii_uppercase();
    EXPECT_EQ(long_string.bytes().data(), uppercased.bytes().data());
 }
+
+// Exercises String::is_ascii() with both ASCII-only and non-ASCII inputs.
+TEST_CASE(is_ascii)
+{
+    // Empty, whitespace-only, short, and longer ASCII-only strings must all be reported as ASCII.
+    EXPECT(String {}.is_ascii());
+    EXPECT(" "_string.is_ascii());
+    EXPECT("abc"_string.is_ascii());
+    EXPECT("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()"_string.is_ascii());
+
+    // A non-ASCII code point, whether alone or embedded among ASCII characters, must fail the check.
+    EXPECT(!"€"_string.is_ascii());
+    EXPECT(!"😀"_string.is_ascii());
+    EXPECT(!"abcdefghijklmnopqrstuvwxyz😀ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789😀!@#$%^&*()"_string.is_ascii());
+}