diff --git a/AK/String.cpp b/AK/String.cpp
index 9c3ae084eeb..52cfe7e3716 100644
--- a/AK/String.cpp
+++ b/AK/String.cpp
@@ -66,15 +66,25 @@ ErrorOr<String> String::from_utf16(Utf16View const& utf16)
     return utf16.to_utf8();
 }
 
-ErrorOr<String> String::from_utf16_le(ReadonlyBytes bytes)
+ErrorOr<String> String::from_utf16_le_with_replacement_character(ReadonlyBytes bytes)
 {
-    if (!validate_utf16_le(bytes))
-        return Error::from_string_literal("String::from_utf16_le: Input was not valid UTF-16LE");
     if (bytes.is_empty())
         return String {};
 
     auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
     auto utf16_length = bytes.size() / 2;
+
+    Utf16Data well_formed_utf16;
+
+    // Ill-formed input (lone or swapped surrogates) is copied into a scratch buffer in which every
+    // offending code unit is replaced with U+FFFD; the buffer needs one code unit per input code unit.
+    if (!validate_utf16_le(bytes)) {
+        well_formed_utf16.resize(utf16_length);
+
+        simdutf::to_well_formed_utf16le(utf16_data, utf16_length, well_formed_utf16.data());
+        utf16_data = well_formed_utf16.data();
+    }
+
     auto utf8_length = simdutf::utf8_length_from_utf16le(utf16_data, utf16_length);
 
     String result;
@@ -87,15 +97,25 @@ ErrorOr<String> String::from_utf16_le(ReadonlyBytes bytes)
     return result;
 }
 
-ErrorOr<String> String::from_utf16_be(ReadonlyBytes bytes)
+ErrorOr<String> String::from_utf16_be_with_replacement_character(ReadonlyBytes bytes)
 {
-    if (!validate_utf16_be(bytes))
-        return Error::from_string_literal("String::from_utf16_be: Input was not valid UTF-16BE");
     if (bytes.is_empty())
         return String {};
 
     auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
     auto utf16_length = bytes.size() / 2;
+
+    Utf16Data well_formed_utf16;
+
+    // Validate with the big-endian validator: byte order decides which code units are surrogates, so
+    // the little-endian check would both miss ill-formed BE input and misreport well-formed BE input.
+    if (!validate_utf16_be(bytes)) {
+        well_formed_utf16.resize(utf16_length);
+
+        simdutf::to_well_formed_utf16be(utf16_data, utf16_length, well_formed_utf16.data());
+        utf16_data = well_formed_utf16.data();
+    }
+
     auto utf8_length = simdutf::utf8_length_from_utf16be(utf16_data, utf16_length);
 
     String result;
diff --git a/AK/String.h b/AK/String.h
index c4d42fc9670..f30fbb6e405 100644
--- a/AK/String.h
+++ b/AK/String.h
@@ -70,8 +70,8 @@ public:
 
     // Creates a new String from a sequence of UTF-16 encoded code points.
static ErrorOr from_utf16(Utf16View const&); - static ErrorOr from_utf16_le(ReadonlyBytes); - static ErrorOr from_utf16_be(ReadonlyBytes); + static ErrorOr from_utf16_le_with_replacement_character(ReadonlyBytes); + static ErrorOr from_utf16_be_with_replacement_character(ReadonlyBytes); // Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream. static ErrorOr from_stream(Stream&, size_t byte_count); diff --git a/Libraries/LibTextCodec/Decoder.cpp b/Libraries/LibTextCodec/Decoder.cpp index d2f72183bae..4ffefe3ab49 100644 --- a/Libraries/LibTextCodec/Decoder.cpp +++ b/Libraries/LibTextCodec/Decoder.cpp @@ -381,7 +381,7 @@ ErrorOr UTF16BEDecoder::to_utf8(StringView input) if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) input = input.substring_view(2); - return String::from_utf16_be(input.bytes()); + return String::from_utf16_be_with_replacement_character(input.bytes()); } bool UTF16LEDecoder::validate(StringView input) @@ -395,7 +395,7 @@ ErrorOr UTF16LEDecoder::to_utf8(StringView input) if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE) input = input.substring_view(2); - return String::from_utf16_le(input.bytes()); + return String::from_utf16_le_with_replacement_character(input.bytes()); } ErrorOr Latin1Decoder::process(StringView input, Function(u32)> on_code_point) diff --git a/Tests/LibWeb/Text/expected/wpt-import/encoding/textdecoder-utf16-surrogates.any.txt b/Tests/LibWeb/Text/expected/wpt-import/encoding/textdecoder-utf16-surrogates.any.txt new file mode 100644 index 00000000000..4e9fbb835ce --- /dev/null +++ b/Tests/LibWeb/Text/expected/wpt-import/encoding/textdecoder-utf16-surrogates.any.txt @@ -0,0 +1,15 @@ +Harness status: OK + +Found 10 tests + +10 Pass +Pass utf-16le - lone surrogate lead +Pass utf-16le - lone surrogate lead (fatal flag set) +Pass utf-16le - lone surrogate trail +Pass utf-16le - lone surrogate trail (fatal flag set) +Pass utf-16le - 
unmatched surrogate lead +Pass utf-16le - unmatched surrogate lead (fatal flag set) +Pass utf-16le - unmatched surrogate trail +Pass utf-16le - unmatched surrogate trail (fatal flag set) +Pass utf-16le - swapped surrogate pair +Pass utf-16le - swapped surrogate pair (fatal flag set) \ No newline at end of file diff --git a/Tests/LibWeb/Text/input/wpt-import/encoding/textdecoder-utf16-surrogates.any.html b/Tests/LibWeb/Text/input/wpt-import/encoding/textdecoder-utf16-surrogates.any.html new file mode 100644 index 00000000000..cbad02b25d2 --- /dev/null +++ b/Tests/LibWeb/Text/input/wpt-import/encoding/textdecoder-utf16-surrogates.any.html @@ -0,0 +1,15 @@ + + +Encoding API: UTF-16 surrogate handling + + + + +
+ diff --git a/Tests/LibWeb/Text/input/wpt-import/encoding/textdecoder-utf16-surrogates.any.js b/Tests/LibWeb/Text/input/wpt-import/encoding/textdecoder-utf16-surrogates.any.js new file mode 100644 index 00000000000..7e8322cd19c --- /dev/null +++ b/Tests/LibWeb/Text/input/wpt-import/encoding/textdecoder-utf16-surrogates.any.js @@ -0,0 +1,46 @@ +// META: global=window,dedicatedworker,shadowrealm +// META: title=Encoding API: UTF-16 surrogate handling + +var bad = [ + { + encoding: 'utf-16le', + input: [0x00, 0xd8], + expected: '\uFFFD', + name: 'lone surrogate lead' + }, + { + encoding: 'utf-16le', + input: [0x00, 0xdc], + expected: '\uFFFD', + name: 'lone surrogate trail' + }, + { + encoding: 'utf-16le', + input: [0x00, 0xd8, 0x00, 0x00], + expected: '\uFFFD\u0000', + name: 'unmatched surrogate lead' + }, + { + encoding: 'utf-16le', + input: [0x00, 0xdc, 0x00, 0x00], + expected: '\uFFFD\u0000', + name: 'unmatched surrogate trail' + }, + { + encoding: 'utf-16le', + input: [0x00, 0xdc, 0x00, 0xd8], + expected: '\uFFFD\uFFFD', + name: 'swapped surrogate pair' + } +]; + +bad.forEach(function(t) { + test(function() { + assert_equals(new TextDecoder(t.encoding).decode(new Uint8Array(t.input)), t.expected); + }, t.encoding + ' - ' + t.name); + test(function() { + assert_throws_js(TypeError, function() { + new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input)) + }); + }, t.encoding + ' - ' + t.name + ' (fatal flag set)'); +});