mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-08-08 01:00:05 +00:00
LibTextCodec: Replace unmatched utf16 surrogates
This commit is contained in:
parent
1b8a77f98c
commit
f098bd029c
Notes:
github-actions[bot]
2025-07-05 14:00:10 +00:00
Author: https://github.com/Gingeh
Commit: f098bd029c
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5266
Reviewed-by: https://github.com/shannonbooth
Reviewed-by: https://github.com/trflynn89
6 changed files with 102 additions and 10 deletions
|
@ -66,15 +66,23 @@ ErrorOr<String> String::from_utf16(Utf16View const& utf16)
|
||||||
return utf16.to_utf8();
|
return utf16.to_utf8();
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<String> String::from_utf16_le(ReadonlyBytes bytes)
|
ErrorOr<String> String::from_utf16_le_with_replacement_character(ReadonlyBytes bytes)
|
||||||
{
|
{
|
||||||
if (!validate_utf16_le(bytes))
|
|
||||||
return Error::from_string_literal("String::from_utf16_le: Input was not valid UTF-16LE");
|
|
||||||
if (bytes.is_empty())
|
if (bytes.is_empty())
|
||||||
return String {};
|
return String {};
|
||||||
|
|
||||||
auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
|
auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
|
||||||
auto utf16_length = bytes.size() / 2;
|
auto utf16_length = bytes.size() / 2;
|
||||||
|
|
||||||
|
Utf16Data well_formed_utf16;
|
||||||
|
|
||||||
|
if (!validate_utf16_le(bytes)) {
|
||||||
|
// The scratch buffer holds char16_t code units, and to_well_formed_utf16le writes exactly
// utf16_length of them, so size it in code units rather than in input bytes.
well_formed_utf16.resize(utf16_length);
|
||||||
|
|
||||||
|
simdutf::to_well_formed_utf16le(utf16_data, utf16_length, well_formed_utf16.data());
|
||||||
|
utf16_data = well_formed_utf16.data();
|
||||||
|
}
|
||||||
|
|
||||||
auto utf8_length = simdutf::utf8_length_from_utf16le(utf16_data, utf16_length);
|
auto utf8_length = simdutf::utf8_length_from_utf16le(utf16_data, utf16_length);
|
||||||
|
|
||||||
String result;
|
String result;
|
||||||
|
@ -87,15 +95,23 @@ ErrorOr<String> String::from_utf16_le(ReadonlyBytes bytes)
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<String> String::from_utf16_be(ReadonlyBytes bytes)
|
ErrorOr<String> String::from_utf16_be_with_replacement_character(ReadonlyBytes bytes)
|
||||||
{
|
{
|
||||||
if (!validate_utf16_be(bytes))
|
|
||||||
return Error::from_string_literal("String::from_utf16_be: Input was not valid UTF-16BE");
|
|
||||||
if (bytes.is_empty())
|
if (bytes.is_empty())
|
||||||
return String {};
|
return String {};
|
||||||
|
|
||||||
auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
|
auto const* utf16_data = reinterpret_cast<char16_t const*>(bytes.data());
|
||||||
auto utf16_length = bytes.size() / 2;
|
auto utf16_length = bytes.size() / 2;
|
||||||
|
|
||||||
|
Utf16Data well_formed_utf16;
|
||||||
|
|
||||||
|
// This is the big-endian path, so the input must be checked with the big-endian validator.
// Checking it as little-endian can let BE input with unpaired surrogates skip sanitization.
if (!validate_utf16_be(bytes)) {
|
||||||
|
// The scratch buffer holds char16_t code units, and to_well_formed_utf16be writes exactly
// utf16_length of them, so size it in code units rather than in input bytes.
well_formed_utf16.resize(utf16_length);
|
||||||
|
|
||||||
|
simdutf::to_well_formed_utf16be(utf16_data, utf16_length, well_formed_utf16.data());
|
||||||
|
utf16_data = well_formed_utf16.data();
|
||||||
|
}
|
||||||
|
|
||||||
auto utf8_length = simdutf::utf8_length_from_utf16be(utf16_data, utf16_length);
|
auto utf8_length = simdutf::utf8_length_from_utf16be(utf16_data, utf16_length);
|
||||||
|
|
||||||
String result;
|
String result;
|
||||||
|
|
|
@ -70,8 +70,8 @@ public:
|
||||||
|
|
||||||
// Creates a new String from a sequence of UTF-16 encoded code points.
|
// Creates a new String from a sequence of UTF-16 encoded code points.
|
||||||
static ErrorOr<String> from_utf16(Utf16View const&);
|
static ErrorOr<String> from_utf16(Utf16View const&);
|
||||||
static ErrorOr<String> from_utf16_le(ReadonlyBytes);
|
static ErrorOr<String> from_utf16_le_with_replacement_character(ReadonlyBytes);
|
||||||
static ErrorOr<String> from_utf16_be(ReadonlyBytes);
|
static ErrorOr<String> from_utf16_be_with_replacement_character(ReadonlyBytes);
|
||||||
|
|
||||||
// Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
|
// Creates a new String by reading byte_count bytes from a UTF-8 encoded Stream.
|
||||||
static ErrorOr<String> from_stream(Stream&, size_t byte_count);
|
static ErrorOr<String> from_stream(Stream&, size_t byte_count);
|
||||||
|
|
|
@ -381,7 +381,7 @@ ErrorOr<String> UTF16BEDecoder::to_utf8(StringView input)
|
||||||
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
|
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF)
|
||||||
input = input.substring_view(2);
|
input = input.substring_view(2);
|
||||||
|
|
||||||
return String::from_utf16_be(input.bytes());
|
return String::from_utf16_be_with_replacement_character(input.bytes());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool UTF16LEDecoder::validate(StringView input)
|
bool UTF16LEDecoder::validate(StringView input)
|
||||||
|
@ -395,7 +395,7 @@ ErrorOr<String> UTF16LEDecoder::to_utf8(StringView input)
|
||||||
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
|
if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE)
|
||||||
input = input.substring_view(2);
|
input = input.substring_view(2);
|
||||||
|
|
||||||
return String::from_utf16_le(input.bytes());
|
return String::from_utf16_le_with_replacement_character(input.bytes());
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
ErrorOr<void> Latin1Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
|
||||||
|
|
|
@ -0,0 +1,15 @@
|
||||||
|
Harness status: OK
|
||||||
|
|
||||||
|
Found 10 tests
|
||||||
|
|
||||||
|
10 Pass
|
||||||
|
Pass utf-16le - lone surrogate lead
|
||||||
|
Pass utf-16le - lone surrogate lead (fatal flag set)
|
||||||
|
Pass utf-16le - lone surrogate trail
|
||||||
|
Pass utf-16le - lone surrogate trail (fatal flag set)
|
||||||
|
Pass utf-16le - unmatched surrogate lead
|
||||||
|
Pass utf-16le - unmatched surrogate lead (fatal flag set)
|
||||||
|
Pass utf-16le - unmatched surrogate trail
|
||||||
|
Pass utf-16le - unmatched surrogate trail (fatal flag set)
|
||||||
|
Pass utf-16le - swapped surrogate pair
|
||||||
|
Pass utf-16le - swapped surrogate pair (fatal flag set)
|
|
@ -0,0 +1,15 @@
|
||||||
|
<!doctype html>
|
||||||
|
<meta charset=utf-8>
|
||||||
|
<title>Encoding API: UTF-16 surrogate handling</title>
|
||||||
|
<script>
|
||||||
|
self.GLOBAL = {
|
||||||
|
isWindow: function() { return true; },
|
||||||
|
isWorker: function() { return false; },
|
||||||
|
isShadowRealm: function() { return false; },
|
||||||
|
};
|
||||||
|
</script>
|
||||||
|
<script src="../resources/testharness.js"></script>
|
||||||
|
<script src="../resources/testharnessreport.js"></script>
|
||||||
|
|
||||||
|
<div id=log></div>
|
||||||
|
<script src="../encoding/textdecoder-utf16-surrogates.any.js"></script>
|
|
@ -0,0 +1,46 @@
|
||||||
|
// META: global=window,dedicatedworker,shadowrealm
|
||||||
|
// META: title=Encoding API: UTF-16 surrogate handling
|
||||||
|
|
||||||
|
var bad = [
|
||||||
|
{
|
||||||
|
encoding: 'utf-16le',
|
||||||
|
input: [0x00, 0xd8],
|
||||||
|
expected: '\uFFFD',
|
||||||
|
name: 'lone surrogate lead'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
encoding: 'utf-16le',
|
||||||
|
input: [0x00, 0xdc],
|
||||||
|
expected: '\uFFFD',
|
||||||
|
name: 'lone surrogate trail'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
encoding: 'utf-16le',
|
||||||
|
input: [0x00, 0xd8, 0x00, 0x00],
|
||||||
|
expected: '\uFFFD\u0000',
|
||||||
|
name: 'unmatched surrogate lead'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
encoding: 'utf-16le',
|
||||||
|
input: [0x00, 0xdc, 0x00, 0x00],
|
||||||
|
expected: '\uFFFD\u0000',
|
||||||
|
name: 'unmatched surrogate trail'
|
||||||
|
},
|
||||||
|
{
|
||||||
|
encoding: 'utf-16le',
|
||||||
|
input: [0x00, 0xdc, 0x00, 0xd8],
|
||||||
|
expected: '\uFFFD\uFFFD',
|
||||||
|
name: 'swapped surrogate pair'
|
||||||
|
}
|
||||||
|
];
|
||||||
|
|
||||||
|
bad.forEach(function(t) {
|
||||||
|
test(function() {
|
||||||
|
assert_equals(new TextDecoder(t.encoding).decode(new Uint8Array(t.input)), t.expected);
|
||||||
|
}, t.encoding + ' - ' + t.name);
|
||||||
|
test(function() {
|
||||||
|
assert_throws_js(TypeError, function() {
|
||||||
|
new TextDecoder(t.encoding, {fatal: true}).decode(new Uint8Array(t.input))
|
||||||
|
});
|
||||||
|
}, t.encoding + ' - ' + t.name + ' (fatal flag set)');
|
||||||
|
});
|
Loading…
Add table
Add a link
Reference in a new issue