From 0e0f98a45e32f39d7f703269a8cabff1010843ca Mon Sep 17 00:00:00 2001
From: Luke Wilde
Date: Fri, 11 Feb 2022 20:38:44 +0000
Subject: [PATCH] LibTextCodec: Add x-user-defined decoder

It's a pretty simple charset: the bottom 128 bytes (0x00-0x7F) are
standard ASCII, while the top 128 bytes (0x80-0xFF) are mapped to a
portion of the Unicode Private Use Area, specifically 0xF780-0xF7FF.

This is used by Google Maps for certain blobs.
---
 Userland/Libraries/LibTextCodec/Decoder.cpp | 25 +++++++++++++++++++++
 Userland/Libraries/LibTextCodec/Decoder.h   |  5 +++++
 2 files changed, 30 insertions(+)

diff --git a/Userland/Libraries/LibTextCodec/Decoder.cpp b/Userland/Libraries/LibTextCodec/Decoder.cpp
index a212136fa35..35be9c52efd 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.cpp
+++ b/Userland/Libraries/LibTextCodec/Decoder.cpp
@@ -20,6 +20,7 @@ CyrillicDecoder s_cyrillic_decoder;
 Koi8RDecoder s_koi8r_decoder;
 Latin9Decoder s_latin9_decoder;
 TurkishDecoder s_turkish_decoder;
+XUserDefinedDecoder s_x_user_defined_decoder;
 }
 
 Decoder* decoder_for(const String& a_encoding)
@@ -44,6 +45,8 @@ Decoder* decoder_for(const String& a_encoding)
             return &s_latin9_decoder;
         if (encoding.value().equals_ignoring_case("windows-1254"))
             return &s_turkish_decoder;
+        if (encoding.value().equals_ignoring_case("x-user-defined"))
+            return &s_x_user_defined_decoder;
     }
     dbgln("TextCodec: No decoder implemented for encoding '{}'", a_encoding);
     return nullptr;
@@ -466,4 +469,26 @@ void TurkishDecoder::process(StringView input, Function<void(u32)> on_code_point
     }
 }
 
+// https://encoding.spec.whatwg.org/#x-user-defined-decoder
+void XUserDefinedDecoder::process(StringView input, Function<void(u32)> on_code_point)
+{
+    auto convert_x_user_defined_to_utf8 = [](u8 ch) -> u32 {
+        // 2. If byte is an ASCII byte, return a code point whose value is byte.
+        // https://infra.spec.whatwg.org/#ascii-byte
+        // An ASCII byte is a byte in the range 0x00 (NUL) to 0x7F (DEL), inclusive.
+        // NOTE: This doesn't check for ch >= 0x00, as that would always be true due to being unsigned.
+        if (ch <= 0x7f)
+            return ch;
+
+        // 3. Return a code point whose value is 0xF780 + byte − 0x80.
+        return 0xF780 + ch - 0x80;
+    };
+
+    for (auto ch : input) {
+        on_code_point(convert_x_user_defined_to_utf8(ch));
+    }
+
+    // 1. If byte is end-of-queue, return finished.
+}
+
 }
diff --git a/Userland/Libraries/LibTextCodec/Decoder.h b/Userland/Libraries/LibTextCodec/Decoder.h
index a4b1e68dd25..7c7c47c7fbf 100644
--- a/Userland/Libraries/LibTextCodec/Decoder.h
+++ b/Userland/Libraries/LibTextCodec/Decoder.h
@@ -67,6 +67,11 @@ public:
     virtual void process(StringView, Function<void(u32)> on_code_point) override;
 };
 
+class XUserDefinedDecoder final : public Decoder {
+public:
+    virtual void process(StringView, Function<void(u32)> on_code_point) override;
+};
+
 Decoder* decoder_for(String const& encoding);
 Optional<String> get_standardized_encoding(const String& encoding);
 