LibTextCodec: Replace unmatched utf16 surrogates

Author: https://github.com/Gingeh Commit: f098bd029c Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5266 Reviewed-by: https://github.com/shannonbooth Reviewed-by: https://github.com/trflynn89
2025-08-08 09:09:43 +00:00 · 2025-07-01 23:20:28 +10:00 · 2025-07-01 23:20:28 +10:00 · f098bd029c · 2025-07-05 14:00:10 +00:00
commit f098bd029c
parent 1b8a77f98c
6 changed files with 102 additions and 10 deletions
--- a/AK/String.cpp
+++ b/AK/String.cpp
@ -66,15 +66,23 @@ ErrorOr<String> String::from_utf16(Utf16View const& utf16)
    return utf16.to_utf8();
 }
-ErrorOr<String> String::from_utf16_le(ReadonlyBytes bytes)
+ErrorOr<String> String::from_utf16_le_with_replacement_character(ReadonlyBytes bytes)
 {
    if (!validate_utf16_le(bytes))
        return Error::from_string_literal("String::from_utf16_le: Input was not valid UTF-16LE");
    if (bytes.is_empty())
        return String {};
    auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
    auto utf16_length = bytes.size() / 2;
    Utf16Data well_formed_utf16;
    if (!validate_utf16_le(bytes)) {
        well_formed_utf16.resize(bytes.size());
        simdutf::to_well_formed_utf16le(utf16_data, utf16_length, well_formed_utf16.data());
        utf16_data = well_formed_utf16.data();
    }
    auto utf8_length = simdutf::utf8_length_from_utf16le(utf16_data, utf16_length);
    String result;
@ -87,15 +95,23 @@ ErrorOr<String> String::from_utf16_le(ReadonlyBytes bytes)
    return result;
 }
-ErrorOr<String> String::from_utf16_be(ReadonlyBytes bytes)
+ErrorOr<String> String::from_utf16_be_with_replacement_character(ReadonlyBytes bytes)
 {
    if (!validate_utf16_be(bytes))
        return Error::from_string_literal("String::from_utf16_be: Input was not valid UTF-16BE");
    if (bytes.is_empty())
        return String {};
    auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
    auto utf16_length = bytes.size() / 2;
    Utf16Data well_formed_utf16;
    if (!validate_utf16_le(bytes)) {
        well_formed_utf16.resize(bytes.size());
        simdutf::to_well_formed_utf16be(utf16_data, utf16_length, well_formed_utf16.data());
        utf16_data = well_formed_utf16.data();
    }
    auto utf8_length = simdutf::utf8_length_from_utf16be(utf16_data, utf16_length);
    String result;
--- a/AK/String.h
+++ b/AK/String.h
@ -70,8 +70,8 @@ public:
    // Creates a new String from a sequence of UTF-16 encoded code points.
    static ErrorOr<String> from_utf16(Utf16View const&);
-    static ErrorOr<String> from_utf16_le(ReadonlyBytes);
+    static ErrorOr<String> from_utf16_le_with_replacement_character(ReadonlyBytes);
-    static ErrorOr<String> from_utf16_be(ReadonlyBytes);
+    static ErrorOr<String> from_utf16_be_with_replacement_character(ReadonlyBytes);
    // Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
    static ErrorOr<String> from_stream(Stream&, size_t byte_count);
--- a/Libraries/LibTextCodec/Decoder.cpp
+++ b/Libraries/LibTextCodec/Decoder.cpp
@ -381,7 +381,7 @@ ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
    if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
        input = input.substring_view(2);
-    return String::from_utf16_be(input.bytes());
+    return String::from_utf16_be_with_replacement_character(input.bytes());
 }
 bool UTF16LEDecoder::validate(StringView input)
@ -395,7 +395,7 @@ ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
    if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
        input = input.substring_view(2);
-    return String::from_utf16_le(input.bytes());
+    return String::from_utf16_le_with_replacement_character(input.bytes());
 }
 ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
--- a/Tests/LibWeb/Text/expected/wpt-import/encoding/textdecoder-utf16-surrogates.any.txt
+++ b/Tests/LibWeb/Text/expected/wpt-import/encoding/textdecoder-utf16-surrogates.any.txt
@ -0,0 +1,15 @@
 Harness status: OK
 Found 10 tests
 10 Pass
 Pass	utf-16le - lone surrogate lead
 Pass	utf-16le - lone surrogate lead (fatal flag set)
 Pass	utf-16le - lone surrogate trail
 Pass	utf-16le - lone surrogate trail (fatal flag set)
 Pass	utf-16le - unmatched surrogate lead
 Pass	utf-16le - unmatched surrogate lead (fatal flag set)
 Pass	utf-16le - unmatched surrogate trail
 Pass	utf-16le - unmatched surrogate trail (fatal flag set)
 Pass	utf-16le - swapped surrogate pair
 Pass	utf-16le - swapped surrogate pair (fatal flag set)
--- a/Tests/LibWeb/Text/input/wpt-import/encoding/textdecoder-utf16-surrogates.any.html
+++ b/Tests/LibWeb/Text/input/wpt-import/encoding/textdecoder-utf16-surrogates.any.html
@ -0,0 +1,15 @@
 <!doctype html>
 <!-- Wrapper page: sets up a GLOBAL scope descriptor, loads the testharness scripts, then runs textdecoder-utf16-surrogates.any.js. -->
 <meta charset=utf-8>
 <title>Encoding API: UTF-16 surrogate handling</title>
 <script>
 // Scope descriptor: this page reports itself as a window, not a worker or a ShadowRealm.
 // NOTE(review): presumably consulted by the harness or shared test helpers; the consumer is not visible here.
 self.GLOBAL = {
  isWindow: function() { return true; },
  isWorker: function() { return false; },
  isShadowRealm: function() { return false; },
 };
 </script>
 <script src="../resources/testharness.js"></script>
 <script src="../resources/testharnessreport.js"></script>
 <!-- NOTE(review): presumably the element the harness report writes its results into; confirm against testharnessreport.js. -->
 <div id=log></div>
 <!-- The test script is loaded last, after GLOBAL and the harness scripts above. -->
 <script src="../encoding/textdecoder-utf16-surrogates.any.js"></script>
--- a/Tests/LibWeb/Text/input/wpt-import/encoding/textdecoder-utf16-surrogates.any.js
+++ b/Tests/LibWeb/Text/input/wpt-import/encoding/textdecoder-utf16-surrogates.any.js
@ -0,0 +1,46 @@
 // META: global=window,dedicatedworker,shadowrealm
 // META: title=Encoding API: UTF-16 surrogate handling
 // Malformed UTF-16LE inputs, each paired with the string the test expects a
 // non-fatal TextDecoder to produce for it.
 // Each compact row is [name, input bytes, expected decoded string]; the rows
 // are expanded into { encoding, input, expected, name } records below.
 var bad = [
     ['lone surrogate lead', [0x00, 0xd8], '\uFFFD'],
     ['lone surrogate trail', [0x00, 0xdc], '\uFFFD'],
     ['unmatched surrogate lead', [0x00, 0xd8, 0x00, 0x00], '\uFFFD\u0000'],
     ['unmatched surrogate trail', [0x00, 0xdc, 0x00, 0x00], '\uFFFD\u0000'],
     ['swapped surrogate pair', [0x00, 0xdc, 0x00, 0xd8], '\uFFFD\uFFFD']
 ].map(function(row) {
     return {
         encoding: 'utf-16le',
         input: row[1],
         expected: row[2],
         name: row[0]
     };
 });
 // Register two subtests for every malformed input:
 //   1. default mode: decoding must yield the expected string;
 //   2. fatal mode: decoding must throw a TypeError.
 for (const testCase of bad) {
     const label = testCase.encoding + ' - ' + testCase.name;

     test(() => {
         const decoder = new TextDecoder(testCase.encoding);
         const decoded = decoder.decode(new Uint8Array(testCase.input));
         assert_equals(decoded, testCase.expected);
     }, label);

     test(() => {
         assert_throws_js(TypeError, () => {
             // Construct the decoder inside the callback so any throw is
             // observed by assert_throws_js, as in the original.
             const strictDecoder = new TextDecoder(testCase.encoding, {fatal: true});
             strictDecoder.decode(new Uint8Array(testCase.input));
         });
     }, label + ' (fatal flag set)');
 }