AK+LibURL+LibWeb: Use simdutf to validate ASCII strings

simdutf provides a vectorized ASCII validator, so let's use that instead
of looping over strings manually.
This commit is contained in:
Timothy Flynn 2025-04-06 08:39:05 -04:00
parent 212095e1c2
commit ed3a677f08
10 changed files with 32 additions and 11 deletions

View file

@ -112,6 +112,7 @@ public:
[[nodiscard]] String to_ascii_lowercase() const;
[[nodiscard]] String to_ascii_uppercase() const;
// Returns whether this String is ASCII, by delegating to is_ascii() on the view returned by bytes_as_string_view().
[[nodiscard]] bool is_ascii() const { return bytes_as_string_view().is_ascii(); }
// Compare this String against another string with caseless matching. Using this method requires linking LibUnicode into your application.
[[nodiscard]] bool equals_ignoring_case(String const&) const;

View file

@ -15,6 +15,8 @@
#include <AK/StringView.h>
#include <AK/Vector.h>
#include <simdutf.h>
namespace AK {
StringView::StringView(String const& string)
@ -195,6 +197,13 @@ bool StringView::equals_ignoring_ascii_case(StringView other) const
return StringUtils::equals_ignoring_ascii_case(*this, other);
}
// Reports whether this view consists solely of ASCII bytes.
// An empty view is considered ASCII and short-circuits before simdutf is consulted;
// otherwise the answer comes from simdutf::validate_ascii over the view's bytes.
bool StringView::is_ascii() const
{
    return is_empty() || simdutf::validate_ascii(characters_without_null_termination(), length());
}
ByteString StringView::to_lowercase_string() const
{
return StringImpl::create_lowercased(characters_without_null_termination(), length()).release_nonnull();

View file

@ -100,6 +100,7 @@ public:
[[nodiscard]] bool contains(u32) const;
[[nodiscard]] bool contains(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive) const;
[[nodiscard]] bool equals_ignoring_ascii_case(StringView) const;
// Returns true when the view contains only ASCII; an empty view counts as ASCII.
[[nodiscard]] bool is_ascii() const;
[[nodiscard]] StringView trim(StringView characters, TrimMode mode = TrimMode::Both) const { return StringUtils::trim(*this, characters, mode); }
[[nodiscard]] StringView trim_whitespace(TrimMode mode = TrimMode::Both) const { return StringUtils::trim_whitespace(*this, mode); }

View file

@ -196,7 +196,7 @@ Optional<String> Host::public_suffix() const
auto public_suffix = get_public_suffix(host_string.bytes_as_string_view()).value_or("*"_string);
// 4. Assert: publicSuffix is an ASCII string that does not end with ".".
VERIFY(all_of(public_suffix.code_points(), is_ascii));
VERIFY(public_suffix.is_ascii());
VERIFY(!public_suffix.ends_with('.'));
// 5. Return publicSuffix and trailingDot concatenated.
@ -223,7 +223,7 @@ Optional<String> Host::registrable_domain() const
auto registrable_domain = get_registrable_domain(host_string).value_or("*"_string);
// 4. Assert: registrableDomain is an ASCII string that does not end with ".".
VERIFY(all_of(registrable_domain.code_points(), is_ascii));
VERIFY(registrable_domain.is_ascii());
VERIFY(!registrable_domain.ends_with('.'));
// 5. Return registrableDomain and trailingDot concatenated.

View file

@ -514,7 +514,7 @@ static ErrorOr<String> domain_to_ascii(StringView domain, bool be_strict)
// OPTIMIZATION: If beStrict is false, domain is an ASCII string, and strictly splitting domain on U+002E (.)
// does not produce any item that starts with an ASCII case-insensitive match for "xn--", this
// step is equivalent to ASCII lowercasing domain.
if (!be_strict && all_of(domain, is_ascii)) {
if (!be_strict && domain.is_ascii()) {
// 3. If result is the empty string, domain-to-ASCII validation error, return failure.
if (domain.is_empty())
return Error::from_string_literal("Empty domain");

View file

@ -14,7 +14,7 @@ namespace URL::Pattern {
String escape_a_pattern_string(String const& input)
{
// 1. Assert: input is an ASCII string.
VERIFY(all_of(input.code_points(), is_ascii));
VERIFY(input.is_ascii());
// 2. Let result be the empty string.
StringBuilder result;
@ -51,7 +51,7 @@ String escape_a_pattern_string(String const& input)
String escape_a_regexp_string(String const& input)
{
// 1. Assert: input is an ASCII string.
VERIFY(all_of(input.code_points(), is_ascii));
VERIFY(input.is_ascii());
// 2. Let result be the empty string.
StringBuilder builder;

View file

@ -47,7 +47,7 @@ GC::Ref<Policy> Policy::parse_a_serialized_csp(JS::Realm& realm, Variant<ByteBuf
auto stripped_token_view = stripped_token.bytes_as_string_view();
// 2. If token is an empty string, or if token is not an ASCII string, continue.
if (stripped_token.is_empty() || !all_of(stripped_token_view, is_ascii))
if (stripped_token.is_empty() || !stripped_token_view.is_ascii())
continue;
// 3. Let directive name be the result of collecting a sequence of code points from token which are not

View file

@ -436,7 +436,7 @@ MultipartParsingErrorOr<Vector<XHR::FormDataEntry>> parse_multipart_form_data(JS
header.content_type = "text/plain"_string;
// 2. If contentType is not an ASCII string, set contentType to the empty string.
if (!all_of(header.content_type->code_points(), is_ascii)) {
if (!header.content_type->is_ascii()) {
header.content_type = ""_string;
}

View file

@ -333,10 +333,8 @@ void CookieJar::store_cookie(Web::Cookie::ParsedCookie const& parsed_cookie, con
// 8. If the domain-attribute contains a character that is not in the range of [USASCII] characters, abort these
// steps and ignore the cookie entirely.
for (auto code_point : domain_attribute.code_points()) {
if (!is_ascii(code_point))
return;
}
if (!domain_attribute.is_ascii())
return;
// 9. If the user agent is configured to reject "public suffixes" and the domain-attribute is a public suffix:
if (URL::is_public_suffix(domain_attribute)) {

View file

@ -1516,3 +1516,15 @@ TEST_CASE(to_ascii_uppercase)
auto uppercased = long_string.to_ascii_uppercase();
EXPECT_EQ(long_string.bytes().data(), uppercased.bytes().data());
}
// Exercises String::is_ascii() with empty, short, and long inputs, covering both
// ASCII-only strings and strings that contain non-ASCII code points.
TEST_CASE(is_ascii)
{
    // ASCII-only inputs, including the empty string, must be accepted.
    EXPECT(String {}.is_ascii());
    EXPECT(" "_string.is_ascii());
    EXPECT("abc"_string.is_ascii());
    EXPECT("abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()"_string.is_ascii());

    // Any non-ASCII code point must cause rejection. The first case uses an explicit
    // non-ASCII character: an empty literal here would contradict the empty-string
    // expectation above.
    EXPECT(!"€"_string.is_ascii());
    EXPECT(!"😀"_string.is_ascii());
    EXPECT(!"abcdefghijklmnopqrstuvwxyz😀ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789😀!@#$%^&*()"_string.is_ascii());
}