diff --git a/Tests/LibTextCodec/TestTextEncoders.cpp b/Tests/LibTextCodec/TestTextEncoders.cpp
index 8ed0759bb68..499de5d2a88 100644
--- a/Tests/LibTextCodec/TestTextEncoders.cpp
+++ b/Tests/LibTextCodec/TestTextEncoders.cpp
@@ -153,3 +153,25 @@ TEST_CASE(test_gb18030_encoder)
     EXPECT(processed_bytes[2] == 0xFE);
     EXPECT(processed_bytes[3] == 0xFE);
 }
+
+// Encodes 16 non-ASCII Latin letters, the ASCII text "foo" and the euro sign with the
+// windows-1252 encoder and checks every one of the 20 resulting bytes.
+TEST_CASE(test_windows1252_encoder)
+{
+    auto encoder = TextCodec::encoder_for_exact_name("windows-1252"sv);
+    auto test_string = "ÀÁÂÃÄÅÆÇÈÉÊËÌÍÎÏfoo€"sv;
+    Vector<u8> processed_bytes;
+    MUST(encoder.value().process(
+        Utf8View(test_string),
+        [&](u8 byte) { return processed_bytes.try_append(byte); },
+        [&](u32) -> ErrorOr<void> { EXPECT(false); return {}; }));
+    EXPECT(processed_bytes.size() == 20);
+    // U+00C0..U+00CF are 16 code points (indices 0..15) and must encode to 0xC0..0xCF.
+    for (u8 i = 0; i < 16; i++) {
+        EXPECT(processed_bytes[i] == (0xC0 + i));
+    }
+    EXPECT(processed_bytes[16] == 0x66);
+    EXPECT(processed_bytes[17] == 0x6F);
+    EXPECT(processed_bytes[18] == 0x6F);
+    EXPECT(processed_bytes[19] == 0x80);
+}
diff --git a/Userland/Libraries/LibTextCodec/Encoder.cpp b/Userland/Libraries/LibTextCodec/Encoder.cpp
index 540e3e782bd..fadf264d551 100644
--- a/Userland/Libraries/LibTextCodec/Encoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Encoder.cpp
@@ -22,6 +22,37 @@ EUCJPEncoder s_euc_jp_encoder;
 ISO2022JPEncoder s_iso_2022_jp_encoder;
 ShiftJISEncoder s_shift_jis_encoder;
 EUCKREncoder s_euc_kr_encoder;
+
+// s_{encoding}_index is generated from https://encoding.spec.whatwg.org/indexes.json
+// Found separately in https://encoding.spec.whatwg.org/index-{encoding}.txt
+SingleByteEncoder s_ibm866_encoder { s_ibm866_index };
+SingleByteEncoder s_latin2_encoder { s_iso_8859_2_index };
+SingleByteEncoder s_latin3_encoder { s_iso_8859_3_index };
+SingleByteEncoder s_latin4_encoder { s_iso_8859_4_index };
+SingleByteEncoder s_latin_cyrillic_encoder { s_iso_8859_5_index };
+SingleByteEncoder s_latin_arabic_encoder { s_iso_8859_6_index };
+SingleByteEncoder s_latin_greek_encoder { s_iso_8859_7_index };
+SingleByteEncoder s_latin_hebrew_encoder { s_iso_8859_8_index };
+SingleByteEncoder s_latin6_encoder { s_iso_8859_10_index };
+SingleByteEncoder s_latin7_encoder { s_iso_8859_13_index };
+SingleByteEncoder s_latin8_encoder { s_iso_8859_14_index };
+SingleByteEncoder s_latin9_encoder { s_iso_8859_15_index };
+SingleByteEncoder s_latin10_encoder { s_iso_8859_16_index };
+SingleByteEncoder s_centraleurope_encoder { s_windows_1250_index };
+SingleByteEncoder s_cyrillic_encoder { s_windows_1251_index };
+SingleByteEncoder s_hebrew_encoder { s_windows_1255_index };
+SingleByteEncoder s_koi8r_encoder { s_koi8_r_index };
+SingleByteEncoder s_koi8u_encoder { s_koi8_u_index };
+SingleByteEncoder s_mac_roman_encoder { s_macintosh_index };
+SingleByteEncoder s_windows874_encoder { s_windows_874_index };
+SingleByteEncoder s_windows1252_encoder { s_windows_1252_index };
+SingleByteEncoder s_windows1253_encoder { s_windows_1253_index };
+SingleByteEncoder s_turkish_encoder { s_windows_1254_index };
+SingleByteEncoder s_windows1256_encoder { s_windows_1256_index };
+SingleByteEncoder s_windows1257_encoder { s_windows_1257_index };
+SingleByteEncoder s_windows1258_encoder { s_windows_1258_index };
+SingleByteEncoder s_mac_cyrillic_encoder { s_x_mac_cyrillic_index };
+
 }
 
 Optional<Encoder&> encoder_for_exact_name(StringView encoding)
@@ -42,6 +73,60 @@ Optional<Encoder&> encoder_for_exact_name(StringView encoding)
         return s_gb18030_encoder;
     if (encoding.equals_ignoring_ascii_case("gbk"sv))
         return s_gbk_encoder;
+    if (encoding.equals_ignoring_ascii_case("ibm866"sv))
+        return s_ibm866_encoder;
+    if (encoding.equals_ignoring_ascii_case("iso-8859-2"sv))
+        return s_latin2_encoder;
+    if (encoding.equals_ignoring_ascii_case("iso-8859-3"sv))
+        return s_latin3_encoder;
+    if (encoding.equals_ignoring_ascii_case("iso-8859-4"sv))
+        return s_latin4_encoder;
+    if (encoding.equals_ignoring_ascii_case("iso-8859-5"sv))
+        return s_latin_cyrillic_encoder;
+    if (encoding.equals_ignoring_ascii_case("iso-8859-6"sv))
+        return s_latin_arabic_encoder;
+    if (encoding.equals_ignoring_ascii_case("iso-8859-7"sv))
+        return s_latin_greek_encoder;
+    if (encoding.is_one_of_ignoring_ascii_case("iso-8859-8"sv, "iso-8859-8-i"sv))
+        return s_latin_hebrew_encoder;
+    if (encoding.equals_ignoring_ascii_case("iso-8859-10"sv))
+        return s_latin6_encoder;
+    if (encoding.equals_ignoring_ascii_case("iso-8859-13"sv))
+        return s_latin7_encoder;
+    if (encoding.equals_ignoring_ascii_case("iso-8859-14"sv))
+        return s_latin8_encoder;
+    if (encoding.equals_ignoring_ascii_case("iso-8859-15"sv))
+        return s_latin9_encoder;
+    if (encoding.equals_ignoring_ascii_case("iso-8859-16"sv))
+        return s_latin10_encoder;
+    if (encoding.equals_ignoring_ascii_case("koi8-r"sv))
+        return s_koi8r_encoder;
+    if (encoding.equals_ignoring_ascii_case("koi8-u"sv))
+        return s_koi8u_encoder;
+    if (encoding.equals_ignoring_ascii_case("macintosh"sv))
+        return s_mac_roman_encoder;
+    if (encoding.equals_ignoring_ascii_case("windows-874"sv))
+        return s_windows874_encoder;
+    if (encoding.equals_ignoring_ascii_case("windows-1250"sv))
+        return s_centraleurope_encoder;
+    if (encoding.equals_ignoring_ascii_case("windows-1251"sv))
+        return s_cyrillic_encoder;
+    if (encoding.equals_ignoring_ascii_case("windows-1252"sv))
+        return s_windows1252_encoder;
+    if (encoding.equals_ignoring_ascii_case("windows-1253"sv))
+        return s_windows1253_encoder;
+    if (encoding.equals_ignoring_ascii_case("windows-1254"sv))
+        return s_turkish_encoder;
+    if (encoding.equals_ignoring_ascii_case("windows-1255"sv))
+        return s_hebrew_encoder;
+    if (encoding.equals_ignoring_ascii_case("windows-1256"sv))
+        return s_windows1256_encoder;
+    if (encoding.equals_ignoring_ascii_case("windows-1257"sv))
+        return s_windows1257_encoder;
+    if (encoding.equals_ignoring_ascii_case("windows-1258"sv))
+        return s_windows1258_encoder;
+    if (encoding.equals_ignoring_ascii_case("x-mac-cyrillic"sv))
+        return s_mac_cyrillic_encoder;
     dbgln("TextCodec: No encoder implemented for encoding '{}'", encoding);
     return {};
 }
@@ -551,4 +636,37 @@ ErrorOr<void> GB18030Encoder::process(Utf8View input, Function<ErrorOr<void>(u8)
     return {};
 }
 
+// https://encoding.spec.whatwg.org/#single-byte-encoder
+// Walks the code points of `input`: values below 0x80 are passed to `on_byte` unchanged, any
+// other value is searched for in m_translation_table and emitted as its position + 0x80, or
+// handed to `on_error` when the table has no such entry. Callback errors are propagated via TRY.
+template<Integral ArrayType>
+ErrorOr<void> SingleByteEncoder<ArrayType>::process(Utf8View input, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error)
+{
+    for (u32 const code_point : input) {
+        if (code_point < 0x80) {
+            // 2. If code point is an ASCII code point, return a byte whose value is code point.
+            TRY(on_byte(static_cast<u8>(code_point)));
+        } else {
+            Optional<u8> pointer = {};
+            for (u8 i = 0; i < m_translation_table.size(); i++) {
+                if (m_translation_table[i] == code_point) {
+                    // 3. Let pointer be the index pointer for code point in index single-byte.
+                    pointer = i;
+                    break;
+                }
+            }
+            if (pointer.has_value()) {
+                // 5. Return a byte whose value is pointer + 0x80.
+                TRY(on_byte(static_cast<u8>(pointer.value() + 0x80)));
+            } else {
+                // 4. If pointer is null, return error with code point.
+                TRY(on_error(code_point));
+            }
+        }
+    }
+    // 1. If code point is end-of-queue, return finished.
+    return {};
+}
+
 }
diff --git a/Userland/Libraries/LibTextCodec/Encoder.h b/Userland/Libraries/LibTextCodec/Encoder.h
index 8241fb67153..32bc0212715 100644
--- a/Userland/Libraries/LibTextCodec/Encoder.h
+++ b/Userland/Libraries/LibTextCodec/Encoder.h
@@ -72,6 +72,22 @@ public:
 private:
     IsGBK m_is_gbk { IsGBK::No };
 };
+
+// Encoder driven by a translation table: the table passed to the constructor is copied into
+// the instance and consulted by process() for every non-ASCII code point.
+template<Integral ArrayType = u32>
+class SingleByteEncoder final : public Encoder {
+public:
+    SingleByteEncoder(Array<ArrayType, 128> translation_table)
+        : m_translation_table(translation_table)
+    {
+    }
+
+    virtual ErrorOr<void> process(Utf8View, Function<ErrorOr<void>(u8)> on_byte, Function<ErrorOr<void>(u32)> on_error) override;
+
+private:
+    Array<ArrayType, 128> m_translation_table;
+};
 
 Optional<Encoder&> encoder_for_exact_name(StringView encoding);
 Optional<Encoder&> encoder_for(StringView label);