From 368dad54ef2a56d52f66d1b39d4aa15c574fee3a Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Thu, 18 Jul 2024 12:07:37 -0400 Subject: [PATCH] LibTextCodec: Use AK facilities to validate and convert UTF-16 to UTF-8 This allows LibTextCodec to make use of simdutf, and also reduces the number of places with manual UTF-16 implementations. --- Tests/LibTextCodec/TestTextDecoders.cpp | 12 ++ Userland/Libraries/LibTextCodec/Decoder.cpp | 130 +++----------------- 2 files changed, 30 insertions(+), 112 deletions(-) diff --git a/Tests/LibTextCodec/TestTextDecoders.cpp b/Tests/LibTextCodec/TestTextDecoders.cpp index d1658d337cd..98b20bc6bd0 100644 --- a/Tests/LibTextCodec/TestTextDecoders.cpp +++ b/Tests/LibTextCodec/TestTextDecoders.cpp @@ -15,6 +15,8 @@ TEST_CASE(test_utf8_decode) // Bytes for U+1F600 GRINNING FACE auto test_string = "\xf0\x9f\x98\x80"sv; + EXPECT(decoder.validate(test_string)); + Vector processed_code_points; MUST(decoder.process(test_string, [&](u32 code_point) { return processed_code_points.try_append(code_point); @@ -31,6 +33,8 @@ TEST_CASE(test_utf16be_decode) // This is the output of `python3 -c "print('säk😀'.encode('utf-16be'))"`. auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv; + EXPECT(decoder.validate(test_string)); + Vector processed_code_points; MUST(decoder.process(test_string, [&](u32 code_point) { return processed_code_points.try_append(code_point); @@ -40,6 +44,9 @@ TEST_CASE(test_utf16be_decode) EXPECT(processed_code_points[1] == 0xE4); EXPECT(processed_code_points[2] == 0x6B); EXPECT(processed_code_points[3] == 0x1F600); + + auto utf8 = MUST(decoder.to_utf8(test_string)); + EXPECT_EQ(utf8, "säk😀"sv); } TEST_CASE(test_utf16le_decode) @@ -48,6 +55,8 @@ TEST_CASE(test_utf16le_decode) // This is the output of `python3 -c "print('säk😀'.encode('utf-16le'))"`. auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv; + EXPECT(decoder.validate(test_string)); + Vector processed_code_points; MUST(decoder.process(test_string, [&](u32 code_point) { return processed_code_points.try_append(code_point); @@ -57,4 +66,7 @@ TEST_CASE(test_utf16le_decode) EXPECT(processed_code_points[1] == 0xE4); EXPECT(processed_code_points[2] == 0x6B); EXPECT(processed_code_points[3] == 0x1F600); + + auto utf8 = MUST(decoder.to_utf8(test_string)); + EXPECT_EQ(utf8, "säk😀"sv); } diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp index e187a233a0b..0fcb526e149 100644 --- a/Userland/Libraries/LibTextCodec/Decoder.cpp +++ b/Userland/Libraries/LibTextCodec/Decoder.cpp @@ -375,150 +375,56 @@ ErrorOr UTF8Decoder::to_utf8(StringView input) return Decoder::to_utf8(bomless_input); } +static Utf16View as_utf16(StringView view, AK::Endianness endianness) +{ + return Utf16View { + { reinterpret_cast(view.bytes().data()), view.length() / 2 }, + endianness + }; +} + ErrorOr UTF16BEDecoder::process(StringView input, Function(u32)> on_code_point) { - // rfc2781, 2.2 Decoding UTF-16 - size_t utf16_length = input.length() - (input.length() % 2); - for (size_t i = 0; i < utf16_length; i += 2) { - // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value - // of W1. Terminate. - u16 w1 = (static_cast(input[i]) << 8) | static_cast(input[i + 1]); - if (!is_unicode_surrogate(w1)) { - TRY(on_code_point(w1)); - continue; - } - - // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence - // is in error and no valid character can be obtained using W1. - // Terminate. - // 3) If there is no W2 (that is, the sequence ends with W1), or if W2 - // is not between 0xDC00 and 0xDFFF, the sequence is in error. - // Terminate. - if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) { - TRY(on_code_point(replacement_code_point)); - continue; - } - - u16 w2 = (static_cast(input[i + 2]) << 8) | static_cast(input[i + 3]); - if (!Utf16View::is_low_surrogate(w2)) { - TRY(on_code_point(replacement_code_point)); - continue; - } - - // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order - // bits of W1 as its 10 high-order bits and the 10 low-order bits of - // W2 as its 10 low-order bits. - // 5) Add 0x10000 to U' to obtain the character value U. Terminate. - TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2))); - i += 2; - } + for (auto code_point : as_utf16(input, AK::Endianness::Big)) + TRY(on_code_point(code_point)); return {}; } bool UTF16BEDecoder::validate(StringView input) { - size_t utf16_length = input.length() - (input.length() % 2); - for (size_t i = 0; i < utf16_length; i += 2) { - u16 w1 = (static_cast(input[i]) << 8) | static_cast(input[i + 1]); - if (!is_unicode_surrogate(w1)) - continue; - - if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) - return false; - - u16 w2 = (static_cast(input[i + 2]) << 8) | static_cast(input[i + 3]); - if (!Utf16View::is_low_surrogate(w2)) - return false; - - i += 2; - } - return true; + return as_utf16(input, AK::Endianness::Big).validate(); } ErrorOr UTF16BEDecoder::to_utf8(StringView input) { // Discard the BOM - auto bomless_input = input; if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFE && bytes[1] == 0xFF) - bomless_input = input.substring_view(2); + input = input.substring_view(2); - StringBuilder builder(bomless_input.length() / 2); - TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); })); - return builder.to_string(); + return String::from_utf16(as_utf16(input, AK::Endianness::Big)); } ErrorOr UTF16LEDecoder::process(StringView input, Function(u32)> on_code_point) { - // rfc2781, 2.2 Decoding UTF-16 - size_t utf16_length = input.length() - (input.length() % 2); - for (size_t i = 0; i < utf16_length; i += 2) { - // 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value - // of W1. Terminate. - u16 w1 = static_cast(input[i]) | (static_cast(input[i + 1]) << 8); - if (!is_unicode_surrogate(w1)) { - TRY(on_code_point(w1)); - continue; - } - - // 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence - // is in error and no valid character can be obtained using W1. - // Terminate. - // 3) If there is no W2 (that is, the sequence ends with W1), or if W2 - // is not between 0xDC00 and 0xDFFF, the sequence is in error. - // Terminate. - if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) { - TRY(on_code_point(replacement_code_point)); - continue; - } - - u16 w2 = static_cast(input[i + 2]) | (static_cast(input[i + 3]) << 8); - if (!Utf16View::is_low_surrogate(w2)) { - TRY(on_code_point(replacement_code_point)); - continue; - } - - // 4) Construct a 20-bit unsigned integer U', taking the 10 low-order - // bits of W1 as its 10 high-order bits and the 10 low-order bits of - // W2 as its 10 low-order bits. - // 5) Add 0x10000 to U' to obtain the character value U. Terminate. - TRY(on_code_point(Utf16View::decode_surrogate_pair(w1, w2))); - i += 2; - } + for (auto code_point : as_utf16(input, AK::Endianness::Little)) + TRY(on_code_point(code_point)); return {}; } bool UTF16LEDecoder::validate(StringView input) { - size_t utf16_length = input.length() - (input.length() % 2); - for (size_t i = 0; i < utf16_length; i += 2) { - u16 w1 = static_cast(input[i]) | (static_cast(input[i + 1]) << 8); - if (!is_unicode_surrogate(w1)) - continue; - - if (!Utf16View::is_high_surrogate(w1) || i + 2 == utf16_length) - return false; - - u16 w2 = static_cast(input[i + 2]) | (static_cast(input[i + 3]) << 8); - if (!Utf16View::is_low_surrogate(w2)) - return false; - - i += 2; - } - return true; + return as_utf16(input, AK::Endianness::Little).validate(); } ErrorOr UTF16LEDecoder::to_utf8(StringView input) { // Discard the BOM - auto bomless_input = input; if (auto bytes = input.bytes(); bytes.size() >= 2 && bytes[0] == 0xFF && bytes[1] == 0xFE) - bomless_input = input.substring_view(2); + input = input.substring_view(2); - StringBuilder builder(bomless_input.length() / 2); - TRY(process(bomless_input, [&builder](u32 c) { return builder.try_append_code_point(c); })); - return builder.to_string(); + return String::from_utf16(as_utf16(input, AK::Endianness::Little)); } ErrorOr Latin1Decoder::process(StringView input, Function(u32)> on_code_point)