LibTextCodec: Replace unmatched utf16 surrogates

This commit is contained in:
Gingeh 2025-07-01 23:20:28 +10:00 committed by Tim Flynn
commit f098bd029c
Notes: github-actions[bot] 2025-07-05 14:00:10 +00:00
6 changed files with 102 additions and 10 deletions

View file

@ -66,15 +66,23 @@ ErrorOr<String> String::from_utf16(Utf16View const& utf16)
return utf16.to_utf8(); return utf16.to_utf8();
} }
ErrorOr<String> String::from_utf16_le(ReadonlyBytes bytes) ErrorOr<String> String::from_utf16_le_with_replacement_character(ReadonlyBytes bytes)
{ {
if (!validate_utf16_le(bytes))
return Error::from_string_literal("String::from_utf16_le: Input was not valid UTF-16LE");
if (bytes.is_empty()) if (bytes.is_empty())
return String {}; return String {};
auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data()); auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
auto utf16_length = bytes.size() / 2; auto utf16_length = bytes.size() / 2;
// Scratch buffer that is only populated when the input is not valid UTF-16LE.
// It must outlive the conversion below because utf16_data is repointed at it.
Utf16Data well_formed_utf16;
if (!validate_utf16_le(bytes)) {
    // simdutf writes exactly utf16_length code units to the output, so size the
    // buffer in code units rather than in bytes (which would allocate twice as much).
    well_formed_utf16.resize(utf16_length);
    // Copies the input while replacing every unpaired surrogate with U+FFFD.
    simdutf::to_well_formed_utf16le(utf16_data, utf16_length, well_formed_utf16.data());
    utf16_data = well_formed_utf16.data();
}
auto utf8_length = simdutf::utf8_length_from_utf16le(utf16_data, utf16_length); auto utf8_length = simdutf::utf8_length_from_utf16le(utf16_data, utf16_length);
String result; String result;
@ -87,15 +95,23 @@ ErrorOr<String> String::from_utf16_le(ReadonlyBytes bytes)
return result; return result;
} }
ErrorOr<String> String::from_utf16_be(ReadonlyBytes bytes) ErrorOr<String> String::from_utf16_be_with_replacement_character(ReadonlyBytes bytes)
{ {
if (!validate_utf16_be(bytes))
return Error::from_string_literal("String::from_utf16_be: Input was not valid UTF-16BE");
if (bytes.is_empty()) if (bytes.is_empty())
return String {}; return String {};
auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data()); auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
auto utf16_length = bytes.size() / 2; auto utf16_length = bytes.size() / 2;
// Scratch buffer that is only populated when the input is not valid UTF-16BE.
// It must outlive the conversion below because utf16_data is repointed at it.
Utf16Data well_formed_utf16;
// The input is big-endian, so it has to be checked with the big-endian validator;
// using the little-endian one would misjudge which code units are surrogates.
if (!validate_utf16_be(bytes)) {
    // simdutf writes exactly utf16_length code units to the output, so size the
    // buffer in code units rather than in bytes (which would allocate twice as much).
    well_formed_utf16.resize(utf16_length);
    // Copies the input while replacing every unpaired surrogate with U+FFFD.
    simdutf::to_well_formed_utf16be(utf16_data, utf16_length, well_formed_utf16.data());
    utf16_data = well_formed_utf16.data();
}
auto utf8_length = simdutf::utf8_length_from_utf16be(utf16_data, utf16_length); auto utf8_length = simdutf::utf8_length_from_utf16be(utf16_data, utf16_length);
String result; String result;

View file

@ -70,8 +70,8 @@ public:
// Creates a new String from a sequence of UTF-16 encoded code points. // Creates a new String from a sequence of UTF-16 encoded code points.
static ErrorOr<String> from_utf16(Utf16View const&); static ErrorOr<String> from_utf16(Utf16View const&);
static ErrorOr<String> from_utf16_le(ReadonlyBytes); static ErrorOr<String> from_utf16_le_with_replacement_character(ReadonlyBytes);
static ErrorOr<String> from_utf16_be(ReadonlyBytes); static ErrorOr<String> from_utf16_be_with_replacement_character(ReadonlyBytes);
// Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream. // Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
static ErrorOr<String> from_stream(Stream&, size_t byte_count); static ErrorOr<String> from_stream(Stream&, size_t byte_count);

View file

@ -381,7 +381,7 @@ ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
input = input.substring_view(2); input = input.substring_view(2);
return String::from_utf16_be(input.bytes()); return String::from_utf16_be_with_replacement_character(input.bytes());
} }
bool UTF16LEDecoder::validate(StringView input) bool UTF16LEDecoder::validate(StringView input)
@ -395,7 +395,7 @@ ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE) if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
input = input.substring_view(2); input = input.substring_view(2);
return String::from_utf16_le(input.bytes()); return String::from_utf16_le_with_replacement_character(input.bytes());
} }
ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point) ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)

View file

@ -0,0 +1,15 @@
Harness status: OK
Found 10 tests
10 Pass
Pass utf-16le - lone surrogate lead
Pass utf-16le - lone surrogate lead (fatal flag set)
Pass utf-16le - lone surrogate trail
Pass utf-16le - lone surrogate trail (fatal flag set)
Pass utf-16le - unmatched surrogate lead
Pass utf-16le - unmatched surrogate lead (fatal flag set)
Pass utf-16le - unmatched surrogate trail
Pass utf-16le - unmatched surrogate trail (fatal flag set)
Pass utf-16le - swapped surrogate pair
Pass utf-16le - swapped surrogate pair (fatal flag set)

View file

@ -0,0 +1,15 @@
<!doctype html>
<meta charset=utf-8>
<title>Encoding API: UTF-16 surrogate handling</title>
<script>
// Describes the global scope this wrapper page runs in: a window, not a worker
// and not a ShadowRealm.
// NOTE(review): presumably read by the testharness scripts loaded below - confirm.
self.GLOBAL = {
isWindow: function() { return true; },
isWorker: function() { return false; },
isShadowRealm: function() { return false; },
};
</script>
<script src="../resources/testharness.js"></script>
<script src="../resources/testharnessreport.js"></script>
<div id=log></div>
<script src="../encoding/textdecoder-utf16-surrogates.any.js"></script>

View file

@ -0,0 +1,46 @@
// META: global=window,dedicatedworker,shadowrealm
// META: title=Encoding API: UTF-16 surrogate handling
// Malformed UTF-16LE byte sequences and the text a non-fatal decoder is expected
// to produce for them. Bytes are little-endian, so [0x00, 0xd8] is the code unit
// 0xD800 (a lead surrogate) and [0x00, 0xdc] is 0xDC00 (a trail surrogate).
var bad = [
    // 0xD800 with nothing after it.
    { name: 'lone surrogate lead', encoding: 'utf-16le', input: [0x00, 0xd8], expected: '\uFFFD' },
    // 0xDC00 with nothing before it.
    { name: 'lone surrogate trail', encoding: 'utf-16le', input: [0x00, 0xdc], expected: '\uFFFD' },
    // 0xD800 followed by 0x0000, which is not a trail surrogate.
    { name: 'unmatched surrogate lead', encoding: 'utf-16le', input: [0x00, 0xd8, 0x00, 0x00], expected: '\uFFFD\u0000' },
    // 0xDC00 followed by 0x0000, with no preceding lead surrogate.
    { name: 'unmatched surrogate trail', encoding: 'utf-16le', input: [0x00, 0xdc, 0x00, 0x00], expected: '\uFFFD\u0000' },
    // Trail (0xDC00) then lead (0xD800): the pair is in the wrong order.
    { name: 'swapped surrogate pair', encoding: 'utf-16le', input: [0x00, 0xdc, 0x00, 0xd8], expected: '\uFFFD\uFFFD' }
];
// Register two subtests per malformed input: one for the default (replacement)
// mode and one for the fatal mode.
bad.forEach(function(testCase) {
    var label = testCase.encoding + ' - ' + testCase.name;

    // Default mode: the decoded text must equal the expected replacement output.
    test(function() {
        var decoder = new TextDecoder(testCase.encoding);
        var decoded = decoder.decode(new Uint8Array(testCase.input));
        assert_equals(decoded, testCase.expected);
    }, label);

    // Fatal mode: decoding the same bytes must throw a TypeError.
    test(function() {
        assert_throws_js(TypeError, function() {
            var decoder = new TextDecoder(testCase.encoding, {fatal: true});
            decoder.decode(new Uint8Array(testCase.input));
        });
    }, label + ' (fatal flag set)');
});