AK+LibJS+LibWeb: Use simdutf to create well-formed strings

This commit is contained in:
Timothy Flynn 2025-07-25 15:37:06 -04:00 committed by Jelle Raaijmakers
commit 1375e6bf39
Notes: github-actions[bot] 2025-07-25 22:41:55 +00:00
9 changed files with 37 additions and 42 deletions

View file

@ -114,6 +114,20 @@ Utf16String Utf16String::repeated(u32 code_point, size_t count)
return builder.to_utf16_string();
}
// Returns this string itself when it validates with AllowLonelySurrogates::No. Otherwise, returns
// a new string built by Detail::Utf16StringData::to_well_formed().
Utf16String Utf16String::to_well_formed() const
{
    auto const is_already_well_formed = utf16_view().validate(AllowLonelySurrogates::No);

    if (!is_already_well_formed)
        return Utf16String { Detail::Utf16StringData::to_well_formed(*this) };

    return *this;
}
// Returns the well-formed version of this string, converted to UTF-8.
//
// to_well_formed() already hands back this string as-is when it validates with
// AllowLonelySurrogates::No, so it is invoked unconditionally. Checking validity here first would
// make an ill-formed string get validated twice (once here, once inside to_well_formed()).
String Utf16String::to_well_formed_utf8() const
{
    return to_well_formed().to_utf8(AllowLonelySurrogates::No);
}
ErrorOr<void> Formatter<Utf16String>::format(FormatBuilder& builder, Utf16String const& utf16_string)
{
if (utf16_string.has_long_utf16_storage())

View file

@ -138,6 +138,9 @@ public:
return from_string_builder_without_validation(builder);
}
// Returns this string unchanged if it validates with AllowLonelySurrogates::No; otherwise returns
// a new string built by Detail::Utf16StringData::to_well_formed().
Utf16String to_well_formed() const;
// Like to_well_formed(), but the result is converted to UTF-8.
String to_well_formed_utf8() const;
// These methods require linking LibUnicode.
Utf16String to_lowercase(Optional<StringView> const& locale = {}) const;
Utf16String to_uppercase(Optional<StringView> const& locale = {}) const;

View file

@ -158,6 +158,16 @@ NonnullRefPtr<Utf16StringData> Utf16StringData::from_string_builder(StringBuilde
return adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length });
}
// Builds new UTF-16 string data holding the output of simdutf::to_well_formed_utf16() for the
// given view. The view must not be backed by ASCII storage.
NonnullRefPtr<Utf16StringData> Utf16StringData::to_well_formed(Utf16View const& utf16_string)
{
    VERIFY(!utf16_string.has_ascii_storage());

    auto const code_unit_count = utf16_string.length_in_code_units();
    auto const* source = utf16_string.utf16_span().data();

    // The destination is allocated with the same code unit count as the source.
    auto result = create_uninitialized(StorageType::UTF16, code_unit_count);
    simdutf::to_well_formed_utf16(source, code_unit_count, result->m_utf16_data);

    return result;
}
size_t Utf16StringData::calculate_code_point_length() const
{
ASSERT(!has_ascii_storage());

View file

@ -35,6 +35,8 @@ public:
static NonnullRefPtr<Utf16StringData> from_utf32(Utf32View const&);
static NonnullRefPtr<Utf16StringData> from_string_builder(StringBuilder&);
// Creates UTF-16 string data from the given view by running it through
// simdutf::to_well_formed_utf16(). The view must not have ASCII storage.
static NonnullRefPtr<Utf16StringData> to_well_formed(Utf16View const&);
~Utf16StringData()
{
if (is_fly_string())

View file

@ -1362,44 +1362,17 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::to_well_formed)
// 2. Let S be ? ToString(O).
auto string = TRY(primitive_string_from(vm));
// NOTE: Rest of steps in to_well_formed below
return PrimitiveString::create(vm, to_well_formed_string(string->utf16_string()));
}
// https://tc39.es/ecma262/#sec-string.prototype.towellformed
String to_well_formed_string(Utf16String const& string)
{
// 3. Let strLen be the length of S.
auto length = string.length_in_code_units();
// 4. Let k be 0.
size_t k = 0;
// 5. Let result be the empty String.
StringBuilder result;
// 6. Repeat, while k < strLen,
while (k < length) {
// a. Let cp be CodePointAt(S, k).
auto code_point = JS::code_point_at(string, k);
// b. If cp.[[IsUnpairedSurrogate]] is true, then
if (code_point.is_unpaired_surrogate) {
// i. Set result to the string-concatenation of result and 0xFFFD (REPLACEMENT CHARACTER).
result.append_code_point(0xfffd);
}
// c. Else,
else {
// i. Set result to the string-concatenation of result and UTF16EncodeCodePoint(cp.[[CodePoint]]).
result.append_code_point(code_point.code_point);
}
// d. Set k to k + cp.[[CodeUnitCount]].
k += code_point.code_unit_count;
}
// 7. Return result.
return MUST(result.to_string());
// a. Let cp be CodePointAt(S, k).
// b. If cp.[[IsUnpairedSurrogate]] is true, then
// i. Set result to the string-concatenation of result and 0xFFFD (REPLACEMENT CHARACTER).
// c. Else,
// i. Set result to the string-concatenation of result and UTF16EncodeCodePoint(cp.[[CodePoint]]).
// d. Set k to k + cp.[[CodeUnitCount]].
return PrimitiveString::create(vm, string->utf16_string().to_well_formed());
}
// 22.1.3.32.1 TrimString ( string, where ), https://tc39.es/ecma262/#sec-trimstring

View file

@ -19,7 +19,6 @@ struct CodePoint {
Optional<size_t> string_index_of(Utf16View const& string, Utf16View const& search_value, size_t from_index);
CodePoint code_point_at(Utf16View const& string, size_t position);
String to_well_formed_string(Utf16String const&);
static constexpr Utf8View whitespace_characters = Utf8View("\x09\x0A\x0B\x0C\x0D\x20\xC2\xA0\xE1\x9A\x80\xE2\x80\x80\xE2\x80\x81\xE2\x80\x82\xE2\x80\x83\xE2\x80\x84\xE2\x80\x85\xE2\x80\x86\xE2\x80\x87\xE2\x80\x88\xE2\x80\x89\xE2\x80\x8A\xE2\x80\xAF\xE2\x81\x9F\xE3\x80\x80\xE2\x80\xA8\xE2\x80\xA9\xEF\xBB\xBF"sv);
ThrowCompletionOr<String> trim_string(VM&, Value string, TrimMode where);

View file

@ -458,11 +458,6 @@ ThrowCompletionOr<Utf16String> Value::to_utf16_string(VM& vm) const
return Utf16String::from_utf8(utf8_string);
}
ThrowCompletionOr<String> Value::to_well_formed_string(VM& vm) const
{
return ::JS::to_well_formed_string(TRY(to_utf16_string(vm)));
}
// 7.1.2 ToBoolean ( argument ), https://tc39.es/ecma262/#sec-toboolean
bool Value::to_boolean_slow_case() const
{

View file

@ -352,7 +352,6 @@ public:
ThrowCompletionOr<String> to_string(VM&) const;
ThrowCompletionOr<ByteString> to_byte_string(VM&) const;
ThrowCompletionOr<Utf16String> to_utf16_string(VM&) const;
ThrowCompletionOr<String> to_well_formed_string(VM&) const;
ThrowCompletionOr<GC::Ref<PrimitiveString>> to_primitive_string(VM&);
ThrowCompletionOr<Value> to_primitive(VM&, PreferredType preferred_type = PreferredType::Default) const;
ThrowCompletionOr<GC::Ref<Object>> to_object(VM&) const;

View file

@ -241,7 +241,7 @@ JS::ThrowCompletionOr<Utf16String> to_utf16_string(JS::VM& vm, JS::Value value)
JS::ThrowCompletionOr<String> to_usv_string(JS::VM& vm, JS::Value value)
{
return value.to_well_formed_string(vm);
return TRY(value.to_utf16_string(vm)).to_well_formed_utf8();
}
// https://webidl.spec.whatwg.org/#invoke-a-callback-function