LibTextCodec: Use AK facilities to validate and convert UTF-16 to UTF-8

This allows LibTextCodec to make use of simdutf, and also reduces the number of places with manual UTF-16 implementations.
Author: https://github.com/trflynn89 Commit: 368dad54ef Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/698
2025-08-02 14:19:48 +00:00 · 2024-07-18 12:07:37 -04:00 · 2024-07-18 12:07:37 -04:00 · 368dad54ef · 2024-07-19 06:22:43 +09:00
commit 368dad54ef
parent 71c29504af
2 changed files with 30 additions and 112 deletions
--- a/Tests/LibTextCodec/TestTextDecoders.cpp
+++ b/Tests/LibTextCodec/TestTextDecoders.cpp
@@ -15,6 +15,8 @@ TEST_CASE(test_utf8_decode)
    // Bytes for U+1F600 GRINNING FACE
    auto test_string = "\xf0\x9f\x98\x80"sv;

+    EXPECT(decoder.validate(test_string));
+
    Vector<u32> processed_code_points;
    MUST(decoder.process(test_string, [&](u32 code_point) {
        return processed_code_points.try_append(code_point);
@@ -31,6 +33,8 @@ TEST_CASE(test_utf16be_decode)
    // This is the output of `python3 -c "print('säk😀'.encode('utf-16be'))"`.
    auto test_string = "\x00s\x00\xe4\x00k\xd8=\xde\x00"sv;

+    EXPECT(decoder.validate(test_string));
+
    Vector<u32> processed_code_points;
    MUST(decoder.process(test_string, [&](u32 code_point) {
        return processed_code_points.try_append(code_point);
@@ -40,6 +44,9 @@ TEST_CASE(test_utf16be_decode)
    EXPECT(processed_code_points[1] == 0xE4);
    EXPECT(processed_code_points[2] == 0x6B);
    EXPECT(processed_code_points[3] == 0x1F600);
+
+    auto utf8 = MUST(decoder.to_utf8(test_string));
+    EXPECT_EQ(utf8, "säk😀"sv);
 }

 TEST_CASE(test_utf16le_decode)
@@ -48,6 +55,8 @@ TEST_CASE(test_utf16le_decode)
    // This is the output of `python3 -c "print('säk😀'.encode('utf-16le'))"`.
    auto test_string = "s\x00\xe4\x00k\x00=\xd8\x00\xde"sv;

+    EXPECT(decoder.validate(test_string));
+
    Vector<u32> processed_code_points;
    MUST(decoder.process(test_string, [&](u32 code_point) {
        return processed_code_points.try_append(code_point);
@@ -57,4 +66,7 @@ TEST_CASE(test_utf16le_decode)
    EXPECT(processed_code_points[1] == 0xE4);
    EXPECT(processed_code_points[2] == 0x6B);
    EXPECT(processed_code_points[3] == 0x1F600);
+
+    auto utf8 = MUST(decoder.to_utf8(test_string));
+    EXPECT_EQ(utf8, "säk😀"sv);
 }