mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-07-31 13:19:05 +00:00
LibTextCodec: Add GBK/GB18030 decoder
Includes changes from GB-18030-2022, which are not yet included in the Encoding Specification, but WebKit, Blink and WPT are already updated.
This commit is contained in:
parent
9ed52504ab
commit
2ce61fe6ea
Notes:
sideshowbarker
2024-07-17 03:35:16 +09:00
Author: https://github.com/skyrising
Commit: 2ce61fe6ea
Pull-request: https://github.com/SerenityOS/serenity/pull/24485
Reviewed-by: https://github.com/ADKaster
8 changed files with 497 additions and 0 deletions
15
Meta/CMake/libtextcodec_generators.cmake
Normal file
15
Meta/CMake/libtextcodec_generators.cmake
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
# Registers the generator step that turns indexes.json into LookupTables.h and
# LookupTables.cpp, and installs the generated header alongside LibTextCodec's headers.
function (generate_encoding_indexes)
    set(LIBTEXTCODEC_INPUT_FOLDER "${CMAKE_CURRENT_SOURCE_DIR}")

    # indexes.json can be found at https://encoding.spec.whatwg.org/indexes.json
    set(ENCODING_INDEXES_JSON "${LIBTEXTCODEC_INPUT_FOLDER}/indexes.json")

    invoke_generator(
        "LookupTables.cpp"
        Lagom::GenerateEncodingIndexes
        "${ENCODING_INDEXES_JSON}"
        "LookupTables.h"
        "LookupTables.cpp"
        arguments -j "${ENCODING_INDEXES_JSON}"
    )

    install(
        FILES "${CMAKE_CURRENT_BINARY_DIR}/LookupTables.h"
        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/LibTextCodec/"
    )
endfunction()
|
|
@ -6,6 +6,7 @@ endif()
|
||||||
add_subdirectory(LibEDID)
|
add_subdirectory(LibEDID)
|
||||||
add_subdirectory(LibGL)
|
add_subdirectory(LibGL)
|
||||||
add_subdirectory(LibLocale)
|
add_subdirectory(LibLocale)
|
||||||
|
add_subdirectory(LibTextCodec)
|
||||||
add_subdirectory(LibTimeZone)
|
add_subdirectory(LibTimeZone)
|
||||||
add_subdirectory(LibUnicode)
|
add_subdirectory(LibUnicode)
|
||||||
add_subdirectory(LibWeb)
|
add_subdirectory(LibWeb)
|
||||||
|
|
|
@ -0,0 +1 @@
|
||||||
|
lagom_tool(GenerateEncodingIndexes SOURCES GenerateEncodingIndexes.cpp LIBS LibMain)
|
|
@ -0,0 +1,220 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2024, Simon Wanner <simon@skyrising.xyz>
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <AK/Array.h>
|
||||||
|
#include <AK/JsonObject.h>
|
||||||
|
#include <AK/NumericLimits.h>
|
||||||
|
#include <AK/SourceGenerator.h>
|
||||||
|
#include <AK/StringBuilder.h>
|
||||||
|
#include <AK/StringView.h>
|
||||||
|
#include <AK/Vector.h>
|
||||||
|
#include <LibCore/ArgsParser.h>
|
||||||
|
#include <LibCore/File.h>
|
||||||
|
#include <LibMain/Main.h>
|
||||||
|
|
||||||
|
struct LookupTable {
|
||||||
|
u32 first_pointer;
|
||||||
|
u32 max_code_point;
|
||||||
|
Vector<u32> code_points;
|
||||||
|
};
|
||||||
|
|
||||||
|
struct LookupTables {
|
||||||
|
JsonArray const& gb18030_ranges;
|
||||||
|
OrderedHashMap<StringView, LookupTable> indexes;
|
||||||
|
};
|
||||||
|
|
||||||
|
// Flattens one JSON index array into a LookupTable.
//
// Leading null entries are not stored; they are counted in first_pointer so lookups can
// rebase the pointer. Null entries after the first real code point are stored as 0xfffd,
// and any run of 0xfffd at the end of the table is dropped again.
//
// Returns the number of skipped leading entries, the largest value that was appended
// (this includes 0xfffd placeholders) and the stored code points.
LookupTable prepare_table(JsonArray const& data)
{
    Vector<u32> code_points;
    code_points.ensure_capacity(data.size());

    u32 max = 0;
    u32 first_pointer = 0;
    for (auto const& entry : data.values()) {
        if (entry.is_null() && code_points.is_empty()) {
            // Still inside the leading run of nulls: move the table start instead of storing a placeholder.
            first_pointer++;
            continue;
        }

        u32 const code_point = entry.is_null() ? 0xfffdu : entry.as_integer<u32>();
        code_points.append(code_point);
        max = AK::max(max, code_point);
    }

    // An empty or all-null index leaves the vector empty, so check before peeking at the last element.
    while (!code_points.is_empty() && code_points.last() == 0xfffd)
        code_points.take_last();

    return { first_pointer, max, move(code_points) };
}
|
||||||
|
|
||||||
|
// Emits the header-side definitions for one index: the optional first-pointer constant,
// the constexpr code point array, and the declaration of its lookup function.
// `generator` is taken by value; callers pass a fork so the keys set here stay local.
void generate_table(SourceGenerator generator, StringView name, LookupTable& table)
{
    auto const entry_count = table.code_points.size();
    bool const needs_wide_elements = table.max_code_point > NumericLimits<u16>::max();

    generator.set("name", name);
    generator.set("value_type", needs_wide_elements ? "u32" : "u16");
    generator.set("first_pointer", MUST(String::number(table.first_pointer)));
    generator.set("size", MUST(String::number(entry_count)));

    // The constant is only emitted when the table does not start at pointer 0.
    if (table.first_pointer > 0)
        generator.appendln("static constexpr u32 s_@name@_index_first_pointer = @first_pointer@;");

    generator.append("static constexpr Array<@value_type@, @size@> s_@name@_index {\n ");
    for (size_t i = 0; i < entry_count; ++i) {
        // Separator goes in front of every element but the first; a new line starts after each 16 values.
        if (i != 0)
            generator.append(i % 16 == 0 ? ",\n "sv : ", "sv);
        generator.append(MUST(String::formatted("{:#04x}", table.code_points[i])));
    }
    generator.appendln("\n};");

    generator.appendln("Optional<u32> index_@name@_code_point(u32 pointer);");
}
|
||||||
|
|
||||||
|
// Writes the generated LookupTables.h to `file`: the Gb18030RangeEntry struct, the
// s_gb18030_ranges array built from tables.gb18030_ranges, and one table definition
// (see generate_table) per entry in tables.indexes.
// Returns an error if writing to `file` fails.
ErrorOr<void> generate_header_file(LookupTables& tables, Core::File& file)
{
    StringBuilder builder;
    SourceGenerator generator { builder };

    generator.set("gb18030_ranges_size", MUST(String::number(tables.gb18030_ranges.size())));

    generator.append(R"~~~(
#pragma once

#include <AK/Array.h>
#include <AK/Types.h>

namespace TextCodec {

struct Gb18030RangeEntry {
    u32 pointer;
    u32 code_point;
};

static constexpr Array<Gb18030RangeEntry, @gb18030_ranges_size@> s_gb18030_ranges { {
)~~~");

    // Each range in the JSON input is a two-element array: [pointer, code point].
    for (auto const& range : tables.gb18030_ranges.values()) {
        auto const& pair = range.as_array();
        generator.appendln(MUST(String::formatted(" {{ {}, {:#04x} }},", pair[0].as_integer<u32>(), pair[1].as_integer<u32>())));
    }
    generator.appendln("} };\n");

    // Iterate by reference: copying an entry would duplicate its whole code point vector.
    for (auto& e : tables.indexes)
        generate_table(generator.fork(), e.key, e.value);

    generator.append("\n");
    generator.appendln("}");

    TRY(file.write_until_depleted(generator.as_string_view().bytes()));
    return {};
}
|
||||||
|
|
||||||
|
// Emits the definition of index_<name>_code_point() for one index.
// Tables that start at pointer 0 get a plain bounds check; tables with skipped leading
// entries rebase the pointer by s_<name>_index_first_pointer first. In both variants a
// stored 0xfffd is reported as "no code point".
void generate_table_implementation(SourceGenerator generator, StringView name, LookupTable& table)
{
    generator.set("name", name);
    generator.set("first_pointer", MUST(String::number(table.first_pointer)));
    generator.set("size", MUST(String::number(table.code_points.size())));

    if (table.first_pointer == 0) {
        generator.append(R"~~~(
Optional<u32> index_@name@_code_point(u32 pointer)
{
    if (pointer >= s_@name@_index.size())
        return {};
    auto value = s_@name@_index[pointer];
    if (value == 0xfffd)
        return {};
    return value;
}
)~~~");
        return;
    }

    generator.append(R"~~~(
Optional<u32> index_@name@_code_point(u32 pointer)
{
    if (pointer < s_@name@_index_first_pointer || pointer - s_@name@_index_first_pointer >= s_@name@_index.size())
        return {};
    auto value = s_@name@_index[pointer - s_@name@_index_first_pointer];
    if (value == 0xfffd)
        return {};
    return value;
}
)~~~");
}
|
||||||
|
|
||||||
|
// Writes the generated LookupTables.cpp to `file`: one lookup function
// (see generate_table_implementation) per entry in tables.indexes, wrapped in the
// TextCodec namespace.
// Returns an error if writing to `file` fails.
ErrorOr<void> generate_implementation_file(LookupTables& tables, Core::File& file)
{
    StringBuilder builder;
    SourceGenerator generator { builder };

    generator.append(R"~~~(
#include <LibTextCodec/LookupTables.h>

namespace TextCodec {
)~~~");

    // Iterate by reference: copying an entry would duplicate its whole code point vector.
    for (auto& e : tables.indexes)
        generate_table_implementation(generator.fork(), e.key, e.value);

    generator.appendln("\n}");

    TRY(file.write_until_depleted(generator.as_string_view().bytes()));
    return {};
}
|
||||||
|
|
||||||
|
// Entry point of the generator: reads the encoding indexes JSON given with -j, prepares
// the gb18030 table (with the GB-18030-2022 overrides applied) and writes the header (-h)
// and implementation (-c) files.
ErrorOr<int> serenity_main(Main::Arguments arguments)
{
    StringView generated_header_path;
    StringView generated_implementation_path;
    StringView json_path;

    Core::ArgsParser args_parser;
    args_parser.add_option(generated_header_path, "Path to the lookup table header file to generate", "generated-header-path", 'h', "generated-header-path");
    args_parser.add_option(generated_implementation_path, "Path to the lookup table implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
    args_parser.add_option(json_path, "Path to the JSON file to read from", "json-path", 'j', "json-path");
    args_parser.parse(arguments);

    auto json_file = TRY(Core::File::open(json_path, Core::File::OpenMode::Read));
    auto json_data = TRY(json_file->read_until_eof());
    auto data = TRY(JsonValue::from_string(json_data)).as_object();

    auto gb18030_table = prepare_table(data.get("gb18030"sv)->as_array());

    // FIXME: Encoding specification is not updated to GB-18030-2022 yet (https://github.com/whatwg/encoding/issues/312)
    // NOTE: See https://commits.webkit.org/264918@main
    struct CodePointOverride {
        size_t pointer;
        u32 code_point;
    };
    static constexpr CodePointOverride gb18030_2022_overrides[] = {
        { 7182, 0xfe10 },
        { 7183, 0xfe12 },
        { 7184, 0xfe11 },
        { 7185, 0xfe13 },
        { 7186, 0xfe14 },
        { 7187, 0xfe15 },
        { 7188, 0xfe16 },
        { 7201, 0xfe17 },
        { 7202, 0xfe18 },
        { 7208, 0xfe19 },
        { 23775, 0x9fb4 },
        { 23783, 0x9fb5 },
        { 23788, 0x9fb6 },
        { 23789, 0x9fb7 },
        { 23795, 0x9fb8 },
        { 23812, 0x9fb9 },
        { 23829, 0x9fba },
        { 23845, 0x9fbb },
    };
    // The pointers are used directly as indices into code_points.
    for (auto const& patch : gb18030_2022_overrides)
        gb18030_table.code_points[patch.pointer] = patch.code_point;

    LookupTables tables {
        .gb18030_ranges = data.get("gb18030-ranges"sv)->as_array(),
        .indexes = {
            { "gb18030"sv, move(gb18030_table) },
        },
    };

    auto generated_header_file = TRY(Core::File::open(generated_header_path, Core::File::OpenMode::Write));
    auto generated_implementation_file = TRY(Core::File::open(generated_implementation_path, Core::File::OpenMode::Write));

    TRY(generate_header_file(tables, *generated_header_file));
    TRY(generate_implementation_file(tables, *generated_implementation_file));

    return 0;
}
|
|
@ -1,5 +1,13 @@
|
||||||
|
include(libtextcodec_generators)
|
||||||
|
|
||||||
set(SOURCES
|
set(SOURCES
|
||||||
Decoder.cpp
|
Decoder.cpp
|
||||||
)
|
)
|
||||||
|
|
||||||
|
generate_encoding_indexes()
|
||||||
|
|
||||||
|
set(GENERATED_SOURCES
|
||||||
|
LookupTables.cpp
|
||||||
|
)
|
||||||
|
|
||||||
serenity_lib(LibTextCodec textcodec)
|
serenity_lib(LibTextCodec textcodec)
|
||||||
|
|
|
@ -7,10 +7,12 @@
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
|
||||||
|
#include <AK/BinarySearch.h>
|
||||||
#include <AK/StringBuilder.h>
|
#include <AK/StringBuilder.h>
|
||||||
#include <AK/Utf16View.h>
|
#include <AK/Utf16View.h>
|
||||||
#include <AK/Utf8View.h>
|
#include <AK/Utf8View.h>
|
||||||
#include <LibTextCodec/Decoder.h>
|
#include <LibTextCodec/Decoder.h>
|
||||||
|
#include <LibTextCodec/LookupTables.h>
|
||||||
|
|
||||||
namespace TextCodec {
|
namespace TextCodec {
|
||||||
|
|
||||||
|
@ -26,6 +28,7 @@ Latin9Decoder s_latin9_decoder;
|
||||||
PDFDocEncodingDecoder s_pdf_doc_encoding_decoder;
|
PDFDocEncodingDecoder s_pdf_doc_encoding_decoder;
|
||||||
TurkishDecoder s_turkish_decoder;
|
TurkishDecoder s_turkish_decoder;
|
||||||
XUserDefinedDecoder s_x_user_defined_decoder;
|
XUserDefinedDecoder s_x_user_defined_decoder;
|
||||||
|
GB18030Decoder s_gb18030_decoder;
|
||||||
|
|
||||||
// clang-format off
|
// clang-format off
|
||||||
// https://encoding.spec.whatwg.org/index-ibm866.txt
|
// https://encoding.spec.whatwg.org/index-ibm866.txt
|
||||||
|
@ -297,6 +300,10 @@ Optional<Decoder&> decoder_for(StringView a_encoding)
|
||||||
return s_utf16be_decoder;
|
return s_utf16be_decoder;
|
||||||
if (encoding.value().equals_ignoring_ascii_case("utf-16le"sv))
|
if (encoding.value().equals_ignoring_ascii_case("utf-16le"sv))
|
||||||
return s_utf16le_decoder;
|
return s_utf16le_decoder;
|
||||||
|
if (encoding.value().equals_ignoring_ascii_case("gbk"sv))
|
||||||
|
return s_gb18030_decoder;
|
||||||
|
if (encoding.value().equals_ignoring_ascii_case("gb18030"sv))
|
||||||
|
return s_gb18030_decoder;
|
||||||
if (encoding.value().equals_ignoring_ascii_case("ibm866"sv))
|
if (encoding.value().equals_ignoring_ascii_case("ibm866"sv))
|
||||||
return s_ibm866_decoder;
|
return s_ibm866_decoder;
|
||||||
if (encoding.value().equals_ignoring_ascii_case("iso-8859-2"sv))
|
if (encoding.value().equals_ignoring_ascii_case("iso-8859-2"sv))
|
||||||
|
@ -972,4 +979,208 @@ ErrorOr<void> SingleByteDecoder::process(StringView input, Function<ErrorOr<void
|
||||||
return {};
|
return {};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// https://encoding.spec.whatwg.org/#index-gb18030-ranges-code-point
|
||||||
|
// Maps a four-byte gb18030 pointer to a code point using the ranges index.
// Returns an empty Optional for pointers the specification defines as unmapped.
static Optional<u32> index_gb18030_ranges_code_point(u32 pointer)
{
    // 1. If pointer is greater than 39419 and less than 189000, or pointer is greater than 1237575, return null.
    bool const in_gap = pointer > 39419 && pointer < 189000;
    if (in_gap || pointer > 1237575)
        return {};

    // 2. If pointer is 7457, return code point U+E7C7.
    if (pointer == 7457)
        return 0xE7C7;

    // FIXME: Encoding specification is not updated to GB-18030-2022 yet (https://github.com/whatwg/encoding/issues/312)
    // NOTE: This matches https://commits.webkit.org/266173@main
    switch (pointer) {
    case 19057:
        return 0xE81E; // 82 35 90 37
    case 19058:
        return 0xE826; // 82 35 90 38
    case 19059:
        return 0xE82B; // 82 35 90 39
    case 19060:
        return 0xE82C; // 82 35 91 30
    case 19061:
        return 0xE832; // 82 35 91 31
    case 19062:
        return 0xE843; // 82 35 91 32
    case 19063:
        return 0xE854; // 82 35 91 33
    case 19064:
        return 0xE864; // 82 35 91 34
    case 39076:
        return 0xE78D; // 84 31 82 36
    case 39077:
        return 0xE78F; // 84 31 82 37
    case 39078:
        return 0xE78E; // 84 31 82 38
    case 39079:
        return 0xE790; // 84 31 82 39
    case 39080:
        return 0xE791; // 84 31 83 30
    case 39081:
        return 0xE792; // 84 31 83 31
    case 39082:
        return 0xE793; // 84 31 83 32
    case 39083:
        return 0xE794; // 84 31 83 33
    case 39084:
        return 0xE795; // 84 31 83 34
    case 39085:
        return 0xE796; // 84 31 83 35
    default:
        break;
    }

    // 3. Let offset be the last pointer in index gb18030 ranges that is less than or equal to pointer and let code point offset be its corresponding code point.
    // The search result itself is ignored; only the nearby index it reports is used.
    size_t range_index = 0;
    binary_search(s_gb18030_ranges, pointer, &range_index, [](auto const needle, auto const& entry) -> int {
        if (needle < entry.pointer)
            return -1;
        return needle > entry.pointer ? 1 : 0;
    });
    auto const& range = s_gb18030_ranges[range_index];

    // 4. Return a code point whose value is code point offset + pointer − offset.
    return range.code_point + pointer - range.pointer;
}
|
||||||
|
|
||||||
|
// https://encoding.spec.whatwg.org/#gb18030-decoder
|
||||||
|
// https://encoding.spec.whatwg.org/#gb18030-decoder
// Decodes `input` as gb18030 and calls `on_code_point` once per decoded code point.
// Wherever the specification says "return error", the replacement code point is emitted.
// Any error returned by the callback is propagated to the caller.
ErrorOr<void> GB18030Decoder::process(StringView input, Function<ErrorOr<void>(u32)> on_code_point)
{
    // gb18030’s decoder has an associated gb18030 first, gb18030 second, and gb18030 third (all initially 0x00).
    u8 first = 0x00;
    u8 second = 0x00;
    u8 third = 0x00;

    auto const reset_pending_bytes = [&] {
        first = 0x00;
        second = 0x00;
        third = 0x00;
    };

    // gb18030’s decoder’s handler, given ioQueue and byte, runs these steps:
    // "Restoring" bytes to the ioQueue is done by moving `index` backwards.
    size_t index = 0;
    for (;;) {
        if (index >= input.length()) {
            // 1. If byte is end-of-queue and gb18030 first, gb18030 second, and gb18030 third are 0x00, return finished.
            if (first == 0x00 && second == 0x00 && third == 0x00)
                return {};

            // 2. If byte is end-of-queue, and gb18030 first, gb18030 second, or gb18030 third is not 0x00, set gb18030 first, gb18030 second, and gb18030 third to 0x00, and return error.
            reset_pending_bytes();
            TRY(on_code_point(replacement_code_point));
            continue;
        }

        u8 const byte = input[index++];

        // 3. If gb18030 third is not 0x00, then:
        if (third != 0x00) {
            // 1. If byte is not in the range 0x30 to 0x39, inclusive, then:
            bool const is_fourth_byte = byte >= 0x30 && byte <= 0x39;
            if (!is_fourth_byte) {
                // 1. Restore « gb18030 second, gb18030 third, byte » to ioQueue.
                index -= 3;

                // 2. Set gb18030 first, gb18030 second, and gb18030 third to 0x00.
                reset_pending_bytes();

                // 3. Return error.
                TRY(on_code_point(replacement_code_point));
                continue;
            }

            // 2. Let code point be the index gb18030 ranges code point for ((gb18030 first − 0x81) × (10 × 126 × 10)) + ((gb18030 second − 0x30) × (10 × 126)) + ((gb18030 third − 0x81) × 10) + byte − 0x30.
            auto code_point = index_gb18030_ranges_code_point(((first - 0x81) * (10 * 126 * 10)) + ((second - 0x30) * (10 * 126)) + ((third - 0x81) * 10) + byte - 0x30);

            // 3. Set gb18030 first, gb18030 second, and gb18030 third to 0x00.
            reset_pending_bytes();

            // 4. If code point is null, return error.
            // 5. Return a code point whose value is code point.
            if (code_point.has_value())
                TRY(on_code_point(code_point.value()));
            else
                TRY(on_code_point(replacement_code_point));
            continue;
        }

        // 4. If gb18030 second is not 0x00, then:
        if (second != 0x00) {
            // 1. If byte is in the range 0x81 to 0xFE, inclusive, set gb18030 third to byte and return continue.
            if (byte >= 0x81 && byte <= 0xFE) {
                third = byte;
                continue;
            }

            // 2. Restore « gb18030 second, byte » to ioQueue, set gb18030 first and gb18030 second to 0x00, and return error.
            index -= 2;
            first = 0x00;
            second = 0x00;
            TRY(on_code_point(replacement_code_point));
            continue;
        }

        // 5. If gb18030 first is not 0x00, then:
        if (first != 0x00) {
            // 1. If byte is in the range 0x30 to 0x39, inclusive, set gb18030 second to byte and return continue.
            if (byte >= 0x30 && byte <= 0x39) {
                second = byte;
                continue;
            }

            // 2. Let lead be gb18030 first, let pointer be null, and set gb18030 first to 0x00.
            auto const lead = first;
            Optional<u32> pointer;
            first = 0x00;

            // 3. Let offset be 0x40 if byte is less than 0x7F, otherwise 0x41.
            u8 const offset = byte < 0x7F ? 0x40 : 0x41;

            // 4. If byte is in the range 0x40 to 0x7E, inclusive, or 0x80 to 0xFE, inclusive, set pointer to (lead − 0x81) × 190 + (byte − offset).
            bool const is_low_trail = byte >= 0x40 && byte <= 0x7E;
            bool const is_high_trail = byte >= 0x80 && byte <= 0xFE;
            if (is_low_trail || is_high_trail)
                pointer = (lead - 0x81) * 190 + (byte - offset);

            // 5. Let code point be null if pointer is null, otherwise the index code point for pointer in index gb18030.
            Optional<u32> code_point;
            if (pointer.has_value())
                code_point = index_gb18030_code_point(pointer.value());

            // 6. If code point is non-null, return a code point whose value is code point.
            if (code_point.has_value()) {
                TRY(on_code_point(code_point.value()));
                continue;
            }

            // 7. If byte is an ASCII byte, restore byte to ioQueue.
            if (byte <= 0x7F)
                index--;

            // 8. Return error.
            TRY(on_code_point(replacement_code_point));
            continue;
        }

        // 6. If byte is an ASCII byte, return a code point whose value is byte.
        if (byte <= 0x7F) {
            TRY(on_code_point(byte));
            continue;
        }

        // 7. If byte is 0x80, return code point U+20AC.
        if (byte == 0x80) {
            TRY(on_code_point(0x20AC));
            continue;
        }

        // 8. If byte is in the range 0x81 to 0xFE, inclusive, set gb18030 first to byte and return continue.
        if (byte >= 0x81 && byte <= 0xFE) {
            first = byte;
            continue;
        }

        // 9. Return error.
        TRY(on_code_point(replacement_code_point));
    }
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -95,6 +95,11 @@ public:
|
||||||
virtual bool validate(StringView) override { return true; }
|
virtual bool validate(StringView) override { return true; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class GB18030Decoder final : public Decoder {
|
||||||
|
public:
|
||||||
|
virtual ErrorOr<void> process(StringView, Function<ErrorOr<void>(u32)> on_code_point) override;
|
||||||
|
};
|
||||||
|
|
||||||
Optional<Decoder&> decoder_for(StringView encoding);
|
Optional<Decoder&> decoder_for(StringView encoding);
|
||||||
Optional<StringView> get_standardized_encoding(StringView encoding);
|
Optional<StringView> get_standardized_encoding(StringView encoding);
|
||||||
|
|
||||||
|
|
36
Userland/Libraries/LibTextCodec/indexes.json
Normal file
36
Userland/Libraries/LibTextCodec/indexes.json
Normal file
File diff suppressed because one or more lines are too long
Loading…
Add table
Add a link
Reference in a new issue