diff --git a/AK/Utf16String.cpp b/AK/Utf16String.cpp
index a1d42f37d2d..4accbcbb905 100644
--- a/AK/Utf16String.cpp
+++ b/AK/Utf16String.cpp
@@ -114,6 +114,24 @@ Utf16String Utf16String::repeated(u32 code_point, size_t count)
     return builder.to_utf16_string();
 }
 
+// Returns this string with ill-formed UTF-16 repaired. A string that already validates with
+// AllowLonelySurrogates::No is returned as-is; otherwise new storage is built by
+// Detail::Utf16StringData::to_well_formed().
+Utf16String Utf16String::to_well_formed() const
+{
+    if (utf16_view().validate(AllowLonelySurrogates::No))
+        return *this;
+    return Utf16String { Detail::Utf16StringData::to_well_formed(*this) };
+}
+
+// Returns the UTF-8 encoding of to_well_formed().
+String Utf16String::to_well_formed_utf8() const
+{
+    // to_well_formed() hands back *this when the string already validates, so delegating to it
+    // covers both cases with a single validate() call.
+    return to_well_formed().to_utf8(AllowLonelySurrogates::No);
+}
+
 ErrorOr Formatter::format(FormatBuilder& builder, Utf16String const& utf16_string)
 {
     if (utf16_string.has_long_utf16_storage())
diff --git a/AK/Utf16String.h b/AK/Utf16String.h
index f8c33549dbc..6b7644691a0 100644
--- a/AK/Utf16String.h
+++ b/AK/Utf16String.h
@@ -138,6 +138,9 @@ public:
         return from_string_builder_without_validation(builder);
     }
 
+    Utf16String to_well_formed() const;
+    String to_well_formed_utf8() const;
+
     // These methods require linking LibUnicode.
     Utf16String to_lowercase(Optional const& locale = {}) const;
     Utf16String to_uppercase(Optional const& locale = {}) const;
diff --git a/AK/Utf16StringData.cpp b/AK/Utf16StringData.cpp
index 8066ab16786..437551d57a9 100644
--- a/AK/Utf16StringData.cpp
+++ b/AK/Utf16StringData.cpp
@@ -158,6 +158,18 @@ NonnullRefPtr Utf16StringData::from_string_builder(StringBuilde
     return adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length });
 }
 
+// Creates UTF-16 storage with the same code unit length as utf16_string and lets simdutf write the
+// well-formed result into it. The view must not be ASCII-backed; this is enforced by the VERIFY below.
+NonnullRefPtr Utf16StringData::to_well_formed(Utf16View const& utf16_string)
+{
+    VERIFY(!utf16_string.has_ascii_storage());
+
+    auto string = create_uninitialized(StorageType::UTF16, utf16_string.length_in_code_units());
+    simdutf::to_well_formed_utf16(utf16_string.utf16_span().data(), utf16_string.length_in_code_units(), string->m_utf16_data);
+
+    return string;
+}
+
 size_t Utf16StringData::calculate_code_point_length() const
 {
     ASSERT(!has_ascii_storage());
diff --git a/AK/Utf16StringData.h b/AK/Utf16StringData.h
index 18a2458c37e..85924b0aec1 100644
--- a/AK/Utf16StringData.h
+++ b/AK/Utf16StringData.h
@@ -35,6 +35,8 @@ public:
     static NonnullRefPtr from_utf32(Utf32View const&);
     static NonnullRefPtr from_string_builder(StringBuilder&);
 
+    static NonnullRefPtr to_well_formed(Utf16View const&);
+
     ~Utf16StringData()
     {
         if (is_fly_string())
diff --git a/Libraries/LibJS/Runtime/StringPrototype.cpp b/Libraries/LibJS/Runtime/StringPrototype.cpp
index 8c149f7a562..72ffb9ce90a 100644
--- a/Libraries/LibJS/Runtime/StringPrototype.cpp
+++ b/Libraries/LibJS/Runtime/StringPrototype.cpp
@@ -1362,44 +1362,18 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::to_well_formed)
     // 2. Let S be ? ToString(O).
     auto string = TRY(primitive_string_from(vm));
 
-    // NOTE: Rest of steps in to_well_formed below
-    return PrimitiveString::create(vm, to_well_formed_string(string->utf16_string()));
-}
-
-// https://tc39.es/ecma262/#sec-string.prototype.towellformed
-String to_well_formed_string(Utf16String const& string)
-{
     // 3. Let strLen be the length of S.
-    auto length = string.length_in_code_units();
-
     // 4. Let k be 0.
-    size_t k = 0;
-
     // 5. Let result be the empty String.
-    StringBuilder result;
-
     // 6. Repeat, while k < strLen,
-    while (k < length) {
-        // a. Let cp be CodePointAt(S, k).
-        auto code_point = JS::code_point_at(string, k);
-
-        // b. If cp.[[IsUnpairedSurrogate]] is true, then
-        if (code_point.is_unpaired_surrogate) {
-            // i. Set result to the string-concatenation of result and 0xFFFD (REPLACEMENT CHARACTER).
-            result.append_code_point(0xfffd);
-        }
-        // c. Else,
-        else {
-            // i. Set result to the string-concatenation of result and UTF16EncodeCodePoint(cp.[[CodePoint]]).
-            result.append_code_point(code_point.code_point);
-        }
-
-        // d. Set k to k + cp.[[CodeUnitCount]].
-        k += code_point.code_unit_count;
-    }
-
+    //     a. Let cp be CodePointAt(S, k).
+    //     b. If cp.[[IsUnpairedSurrogate]] is true, then
+    //         i. Set result to the string-concatenation of result and 0xFFFD (REPLACEMENT CHARACTER).
+    //     c. Else,
+    //         i. Set result to the string-concatenation of result and UTF16EncodeCodePoint(cp.[[CodePoint]]).
+    //     d. Set k to k + cp.[[CodeUnitCount]].
     // 7. Return result.
-    return MUST(result.to_string());
+    return PrimitiveString::create(vm, string->utf16_string().to_well_formed());
 }
 
 // 22.1.3.32.1 TrimString ( string, where ), https://tc39.es/ecma262/#sec-trimstring
diff --git a/Libraries/LibJS/Runtime/StringPrototype.h b/Libraries/LibJS/Runtime/StringPrototype.h
index b8f1375eacc..2308ea6d3c4 100644
--- a/Libraries/LibJS/Runtime/StringPrototype.h
+++ b/Libraries/LibJS/Runtime/StringPrototype.h
@@ -19,7 +19,6 @@ struct CodePoint {
 
 Optional string_index_of(Utf16View const& string, Utf16View const& search_value, size_t from_index);
 CodePoint code_point_at(Utf16View const& string, size_t position);
-String to_well_formed_string(Utf16String const&);
 static constexpr Utf8View whitespace_characters = Utf8View("\x09\x0A\x0B\x0C\x0D\x20\xC2\xA0\xE1\x9A\x80\xE2\x80\x80\xE2\x80\x81\xE2\x80\x82\xE2\x80\x83\xE2\x80\x84\xE2\x80\x85\xE2\x80\x86\xE2\x80\x87\xE2\x80\x88\xE2\x80\x89\xE2\x80\x8A\xE2\x80\xAF\xE2\x81\x9F\xE3\x80\x80\xE2\x80\xA8\xE2\x80\xA9\xEF\xBB\xBF"sv);
 ThrowCompletionOr trim_string(VM&, Value string, TrimMode where);
 
diff --git a/Libraries/LibJS/Runtime/Value.cpp b/Libraries/LibJS/Runtime/Value.cpp
index 207b2ea6ce6..d7dc9de2f54 100644
--- a/Libraries/LibJS/Runtime/Value.cpp
+++ b/Libraries/LibJS/Runtime/Value.cpp
@@ -458,11 +458,6 @@ ThrowCompletionOr Value::to_utf16_string(VM& vm) const
     return Utf16String::from_utf8(utf8_string);
 }
 
-ThrowCompletionOr Value::to_well_formed_string(VM& vm) const
-{
-    return ::JS::to_well_formed_string(TRY(to_utf16_string(vm)));
-}
-
 // 7.1.2 ToBoolean ( argument ), https://tc39.es/ecma262/#sec-toboolean
 bool Value::to_boolean_slow_case() const
 {
diff --git a/Libraries/LibJS/Runtime/Value.h b/Libraries/LibJS/Runtime/Value.h
index 9bd32de16e1..bb856079a64 100644
--- a/Libraries/LibJS/Runtime/Value.h
+++ b/Libraries/LibJS/Runtime/Value.h
@@ -352,7 +352,6 @@ public:
     ThrowCompletionOr to_string(VM&) const;
     ThrowCompletionOr to_byte_string(VM&) const;
     ThrowCompletionOr to_utf16_string(VM&) const;
-    ThrowCompletionOr to_well_formed_string(VM&) const;
     ThrowCompletionOr> to_primitive_string(VM&);
     ThrowCompletionOr to_primitive(VM&, PreferredType preferred_type = PreferredType::Default) const;
     ThrowCompletionOr> to_object(VM&) const;
diff --git a/Libraries/LibWeb/WebIDL/AbstractOperations.cpp b/Libraries/LibWeb/WebIDL/AbstractOperations.cpp
index 6afd7d9bc16..365f18922fe 100644
--- a/Libraries/LibWeb/WebIDL/AbstractOperations.cpp
+++ b/Libraries/LibWeb/WebIDL/AbstractOperations.cpp
@@ -241,7 +241,7 @@ JS::ThrowCompletionOr to_utf16_string(JS::VM& vm, JS::Value value)
 
 JS::ThrowCompletionOr to_usv_string(JS::VM& vm, JS::Value value)
 {
-    return value.to_well_formed_string(vm);
+    return TRY(value.to_utf16_string(vm)).to_well_formed_utf8();
 }
 
 // https://webidl.spec.whatwg.org/#invoke-a-callback-function