# Meta/CMake/libtextcodec_generators.cmake

# generate_encoding_indexes()
#
# Registers the build step that turns LibTextCodec's indexes.json into the
# generated LookupTables.h / LookupTables.cpp pair, and optionally installs the
# generated header. Takes no arguments; paths are resolved relative to the
# directory of the CMakeLists.txt that calls it (CMAKE_CURRENT_SOURCE_DIR /
# CMAKE_CURRENT_BINARY_DIR).
function(generate_encoding_indexes)
    set(LIBTEXTCODEC_INPUT_FOLDER "${CMAKE_CURRENT_SOURCE_DIR}")

    # indexes.json can be found at https://encoding.spec.whatwg.org/indexes.json
    # NOTE(review): invoke_py_generator is defined elsewhere; the positional
    # arguments below look like (name, script, primary input, header output,
    # implementation output) -- confirm against its definition. The script
    # itself requires -h, -c and -j; only -j is passed explicitly here, so the
    # helper presumably supplies -h/-c from the two output names.
    invoke_py_generator(
        "LookupTables.cpp"
        "generate_encoding_indexes.py"
        "${LIBTEXTCODEC_INPUT_FOLDER}/indexes.json"
        "LookupTables.h"
        "LookupTables.cpp"
        arguments -j "${LIBTEXTCODEC_INPUT_FOLDER}/indexes.json"
    )

    # The generated header is only installed when header installation is enabled.
    if(ENABLE_INSTALL_HEADERS)
        install(FILES "${CMAKE_CURRENT_BINARY_DIR}/LookupTables.h" DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/LibTextCodec/")
    endif()
endfunction()

# Meta/Lagom/Tools/CodeGenerators/CMakeLists.txt
#
# The LibTextCodec subdirectory is no longer added: its CMakeLists.txt (which
# declared the GenerateEncodingIndexes lagom_tool) is deleted by this change.
add_subdirectory(IPCCompiler)
add_subdirectory(LibURL)
add_subdirectory(LibWeb)
// Meta/Lagom/Tools/CodeGenerators/LibTextCodec/GenerateEncodingIndexes.cpp
//
// NOTE(review): this file is deleted by the change; every line of it is a removal.
// It is kept here, de-collapsed, as the reference for the Python replacement.
// The targets of the #include directives and the template arguments (on Vector,
// Optional, ErrorOr, NumericLimits, ...) are absent from the text as received,
// and whitespace inside string literals may have been collapsed.
/*
 * Copyright (c) 2024, Simon Wanner
 *
 * SPDX-License-Identifier: BSD-2-Clause
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

namespace {

// One index from indexes.json, flattened into a dense list of code points.
struct LookupTable {
    // Number of leading null entries that were skipped before the first stored value.
    u32 first_pointer;
    // Largest value appended while building code_points (0xfffd placeholders included).
    u32 max_code_point;
    // Stored entries in pointer order; a null after the first stored entry becomes 0xfffd.
    Vector code_points;
    // When set, index_<name>_code_point() is declared and defined.
    bool generate_accessor;
    // When set, code_point_<name>_index() is declared and defined.
    bool generate_inverse_accessor;
};

// Everything that ends up in the generated files.
struct LookupTables {
    // The "gb18030-ranges" array; each element is read as a [pointer, code point] pair.
    JsonArray const& gb18030_ranges;
    // Generated C++ identifier fragment -> table, emitted in insertion order.
    OrderedHashMap indexes;
};

enum class GenerateAccessor {
    No,
    Yes,
};

enum class GenerateInverseAccessor {
    No,
    Yes,
};

// Converts one JSON index array into a LookupTable.
//
// Leading nulls are counted in first_pointer instead of being stored; later nulls are stored
// as 0xfffd. With GenerateAccessor::Yes the trailing 0xfffd entries are dropped again; without
// an accessor the table must not have leading nulls (VERIFY).
LookupTable prepare_table(JsonArray const& data, GenerateAccessor generate_accessor = GenerateAccessor::No, GenerateInverseAccessor generate_inverse_accessor = GenerateInverseAccessor::No)
{
    Vector code_points;
    code_points.ensure_capacity(data.size());
    u32 max = 0;
    u32 first_pointer = 0;
    for (auto const& entry : data.values()) {
        if (entry.is_null()) {
            if (code_points.is_empty()) {
                first_pointer++;
            } else {
                code_points.append(0xfffd);
                max = AK::max(max, code_points.last());
            }
        } else {
            code_points.append(entry.as_integer());
            max = AK::max(max, code_points.last());
        }
    }
    if (generate_accessor == GenerateAccessor::Yes) {
        // NOTE(review): there is no emptiness check before last(); an all-null index would reach it.
        while (code_points.last() == 0xfffd)
            code_points.take_last();
    } else {
        VERIFY(first_pointer == 0);
    }
    return { first_pointer, max, move(code_points), generate_accessor == GenerateAccessor::Yes, generate_inverse_accessor == GenerateInverseAccessor::Yes };
}

// Appends the header-side text for one table: the optional s_<name>_index_first_pointer
// constant, the s_<name>_index array (a line break after every 16th element), and the
// declarations of whichever accessors the table requests.
void generate_table(SourceGenerator generator, StringView name, LookupTable& table)
{
    generator.set("name", name);
    // u32 elements are only used when the largest stored value exceeds the NumericLimits bound.
    generator.set("value_type", table.max_code_point > NumericLimits::max() ? "u32" : "u16");
    generator.set("first_pointer", String::number(table.first_pointer));
    generator.set("size", String::number(table.code_points.size()));

    if (table.first_pointer > 0) {
        generator.appendln("static constexpr u32 s_@name@_index_first_pointer = @first_pointer@;");
    }

    generator.append("static constexpr Array<@value_type@, @size@> s_@name@_index {\n ");
    for (size_t i = 0; i < table.code_points.size(); i++) {
        generator.append(MUST(String::formatted("{:#04x}", table.code_points[i])));
        // No separator after the final element.
        if (i != table.code_points.size() - 1)
            generator.append(i % 16 == 15 ? ",\n "sv : ", "sv);
    }
    generator.appendln("\n};");
    if (table.generate_accessor)
        generator.appendln("Optional index_@name@_code_point(u32 pointer);");
    if (table.generate_inverse_accessor)
        generator.appendln("Optional code_point_@name@_index(u32 code_point);");
}

// Writes the generated header: the Gb18030RangeEntry struct, the s_gb18030_ranges array and
// the output of generate_table() for every index, all inside namespace TextCodec.
ErrorOr generate_header_file(LookupTables& tables, Core::File& file)
{
    StringBuilder builder;
    SourceGenerator generator { builder };

    generator.set("gb18030_ranges_size", String::number(tables.gb18030_ranges.size()));

    generator.append(R"~~~(
#pragma once

#include
#include

namespace TextCodec {

struct Gb18030RangeEntry {
    u32 pointer;
    u32 code_point;
};

static constexpr Array s_gb18030_ranges { {
)~~~");

    for (auto const& range : tables.gb18030_ranges.values()) {
        generator.appendln(MUST(String::formatted(" {{ {}, {:#04x} }},", range.as_array()[0].as_integer(), range.as_array()[1].as_integer())));
    }
    generator.appendln("} };\n");

    for (auto e : tables.indexes) {
        // Each table gets a forked generator so its "name"/"size"/... keys stay local.
        generate_table(generator.fork(), e.key, e.value);
    }

    generator.append("\n");
    generator.appendln("}");

    TRY(file.write_until_depleted(generator.as_string_view().bytes()));
    return {};
}

// Appends the definition of index_<name>_code_point(pointer): an empty result when the pointer
// is outside the stored range or the stored value is the 0xfffd placeholder, otherwise the
// stored value. Tables with a non-zero first_pointer subtract it before indexing.
void generate_table_accessor(SourceGenerator generator, StringView name, LookupTable& table)
{
    generator.set("name", name);
    generator.set("first_pointer", String::number(table.first_pointer));
    generator.set("size", String::number(table.code_points.size()));

    if (table.first_pointer > 0) {
        generator.append(R"~~~(
Optional index_@name@_code_point(u32 pointer)
{
    if (pointer < s_@name@_index_first_pointer || pointer - s_@name@_index_first_pointer >= s_@name@_index.size())
        return {};
    auto value = s_@name@_index[pointer - s_@name@_index_first_pointer];
    if (value == 0xfffd)
        return {};
    return value;
}
)~~~");
    } else {
        generator.append(R"~~~(
Optional index_@name@_code_point(u32 pointer)
{
    if (pointer >= s_@name@_index.size())
        return {};
    auto value = s_@name@_index[pointer];
    if (value == 0xfffd)
        return {};
    return value;
}
)~~~");
    }
}

// Appends the definition of code_point_<name>_index(code_point): the pointer of the first
// array element equal to code_point (offset by first_pointer when that is non-zero), or an
// empty result when no element matches.
void generate_inverse_table_accessor(SourceGenerator generator, StringView name, LookupTable& table)
{
    generator.set("name", name);
    generator.set("first_pointer", String::number(table.first_pointer));
    generator.set("size", String::number(table.code_points.size()));

    // FIXME - Doing a linear search here is really slow, should be generating
    // some kind of reverse lookup table.

    if (table.first_pointer > 0) {
        generator.append(R"~~~(
Optional code_point_@name@_index(u32 code_point)
{
    for (u32 i = 0; i < s_@name@_index.size(); ++i) {
        if (s_@name@_index[i] == code_point) {
            return s_@name@_index_first_pointer + i;
        }
    }
    return {};
}
)~~~");
    } else {
        generator.append(R"~~~(
Optional code_point_@name@_index(u32 code_point)
{
    for (u32 i = 0; i < s_@name@_index.size(); ++i) {
        if (s_@name@_index[i] == code_point) {
            return i;
        }
    }
    return {};
}
)~~~");
    }
}

// Writes the generated implementation file: the accessor definitions requested by each table,
// inside namespace TextCodec.
ErrorOr generate_implementation_file(LookupTables& tables, Core::File& file)
{
    StringBuilder builder;
    SourceGenerator generator { builder };

    generator.append(R"~~~(
#include

namespace TextCodec {
)~~~");

    for (auto& [key, table] : tables.indexes) {
        if (table.generate_accessor)
            generate_table_accessor(generator.fork(), key, table);
        if (table.generate_inverse_accessor)
            generate_inverse_table_accessor(generator.fork(), key, table);
    }

    generator.appendln("\n}");

    TRY(file.write_until_depleted(generator.as_string_view().bytes()));
    return {};
}

} // end anonymous namespace

// Entry point: parses -h / -c / -j, loads the JSON, builds every table and writes both files.
ErrorOr serenity_main(Main::Arguments arguments)
{
    StringView generated_header_path;
    StringView generated_implementation_path;
    StringView json_path;

    Core::ArgsParser args_parser;
    args_parser.add_option(generated_header_path, "Path to the lookup table header file to generate", "generated-header-path", 'h', "generated-header-path");
    args_parser.add_option(generated_implementation_path, "Path to the lookup table implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
    args_parser.add_option(json_path, "Path to the JSON file to read from", "json-path", 'j', "json-path");
    args_parser.parse(arguments);

    auto json_file = TRY(Core::File::open(json_path, Core::File::OpenMode::Read));
    auto json_data = TRY(json_file->read_until_eof());
    auto data = TRY(JsonValue::from_string(json_data)).as_object();

    auto gb18030_table = prepare_table(data.get("gb18030"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes);

    // FIXME: Encoding specification is not updated to GB-18030-2022 yet (https://github.com/whatwg/encoding/issues/312)
    // NOTE: See https://commits.webkit.org/264918@main
    // The subscripts below are positions in code_points, applied after prepare_table().
    gb18030_table.code_points[7182] = 0xfe10;
    gb18030_table.code_points[7183] = 0xfe12;
    gb18030_table.code_points[7184] = 0xfe11;
    gb18030_table.code_points[7185] = 0xfe13;
    gb18030_table.code_points[7186] = 0xfe14;
    gb18030_table.code_points[7187] = 0xfe15;
    gb18030_table.code_points[7188] = 0xfe16;
    gb18030_table.code_points[7201] = 0xfe17;
    gb18030_table.code_points[7202] = 0xfe18;
    gb18030_table.code_points[7208] = 0xfe19;
    gb18030_table.code_points[23775] = 0x9fb4;
    gb18030_table.code_points[23783] = 0x9fb5;
    gb18030_table.code_points[23788] = 0x9fb6;
    gb18030_table.code_points[23789] = 0x9fb7;
    gb18030_table.code_points[23795] = 0x9fb8;
    gb18030_table.code_points[23812] = 0x9fb9;
    gb18030_table.code_points[23829] = 0x9fba;
    gb18030_table.code_points[23845] = 0x9fbb;

    // Forward accessors: gb18030, big5, jis0208, jis0212, euc_kr, iso_2022_jp_katakana.
    // Inverse accessors: gb18030, jis0208, euc_kr. All remaining tables are data-only.
    LookupTables tables {
        .gb18030_ranges = data.get("gb18030-ranges"sv)->as_array(),
        .indexes = {
            { "gb18030"sv, move(gb18030_table) },
            { "big5"sv, prepare_table(data.get("big5"sv)->as_array(), GenerateAccessor::Yes) },
            { "jis0208"sv, prepare_table(data.get("jis0208"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes) },
            { "jis0212"sv, prepare_table(data.get("jis0212"sv)->as_array(), GenerateAccessor::Yes) },
            { "euc_kr"sv, prepare_table(data.get("euc-kr"sv)->as_array(), GenerateAccessor::Yes, GenerateInverseAccessor::Yes) },
            { "ibm866"sv, prepare_table(data.get("ibm866"sv)->as_array()) },
            { "iso_2022_jp_katakana"sv, prepare_table(data.get("iso-2022-jp-katakana"sv)->as_array(), GenerateAccessor::Yes) },
            { "iso_8859_2"sv, prepare_table(data.get("iso-8859-2"sv)->as_array()) },
            { "iso_8859_3"sv, prepare_table(data.get("iso-8859-3"sv)->as_array()) },
            { "iso_8859_4"sv, prepare_table(data.get("iso-8859-4"sv)->as_array()) },
            { "iso_8859_5"sv, prepare_table(data.get("iso-8859-5"sv)->as_array()) },
            { "iso_8859_6"sv, prepare_table(data.get("iso-8859-6"sv)->as_array()) },
            { "iso_8859_7"sv, prepare_table(data.get("iso-8859-7"sv)->as_array()) },
            { "iso_8859_8"sv, prepare_table(data.get("iso-8859-8"sv)->as_array()) },
            { "iso_8859_10"sv, prepare_table(data.get("iso-8859-10"sv)->as_array()) },
            { "iso_8859_13"sv, prepare_table(data.get("iso-8859-13"sv)->as_array()) },
            { "iso_8859_14"sv, prepare_table(data.get("iso-8859-14"sv)->as_array()) },
            { "iso_8859_15"sv, prepare_table(data.get("iso-8859-15"sv)->as_array()) },
            { "iso_8859_16"sv, prepare_table(data.get("iso-8859-16"sv)->as_array()) },
            { "koi8_r"sv, prepare_table(data.get("koi8-r"sv)->as_array()) },
            { "koi8_u"sv, prepare_table(data.get("koi8-u"sv)->as_array()) },
            { "macintosh"sv, prepare_table(data.get("macintosh"sv)->as_array()) },
            { "windows_874"sv, prepare_table(data.get("windows-874"sv)->as_array()) },
            { "windows_1250"sv, prepare_table(data.get("windows-1250"sv)->as_array()) },
            { "windows_1251"sv, prepare_table(data.get("windows-1251"sv)->as_array()) },
            { "windows_1252"sv, prepare_table(data.get("windows-1252"sv)->as_array()) },
            { "windows_1253"sv, prepare_table(data.get("windows-1253"sv)->as_array()) },
            { "windows_1254"sv, prepare_table(data.get("windows-1254"sv)->as_array()) },
            { "windows_1255"sv, prepare_table(data.get("windows-1255"sv)->as_array()) },
            { "windows_1256"sv, prepare_table(data.get("windows-1256"sv)->as_array()) },
            { "windows_1257"sv, prepare_table(data.get("windows-1257"sv)->as_array()) },
            { "windows_1258"sv, prepare_table(data.get("windows-1258"sv)->as_array()) },
            { "x_mac_cyrillic"sv, prepare_table(data.get("x-mac-cyrillic"sv)->as_array()) },
        },
    };

    auto generated_header_file = TRY(Core::File::open(generated_header_path, Core::File::OpenMode::Write));
    auto generated_implementation_file = TRY(Core::File::open(generated_implementation_path, Core::File::OpenMode::Write));

    TRY(generate_header_file(tables, *generated_header_file));
    TRY(generate_implementation_file(tables, *generated_implementation_file));

    return 0;
}
#!/usr/bin/env python3

# Copyright (c) 2024, Simon Wanner
# Copyright (c) 2025, ayeteadoe
#
# SPDX-License-Identifier: BSD-2-Clause

"""Generate LibTextCodec's LookupTables.h / LookupTables.cpp from the WHATWG indexes.json.

Usage:
    generate_encoding_indexes.py -h <LookupTables.h> -c <LookupTables.cpp> -j <indexes.json>

Python replacement for the former Lagom GenerateEncodingIndexes C++ tool: the same tables get
the same accessors, so the generated sources expose the same set of functions.
"""

import argparse
import json

from dataclasses import dataclass
from enum import Enum
from pathlib import Path
from typing import Any
from typing import Optional

# Value stored for a null entry that follows the first mapped entry of an index. The generated
# forward accessors treat it as "no mapping".
REPLACEMENT_CODE_POINT = 0xFFFD

# Largest value that still fits the u16 element type of a generated array.
MAX_U16 = (1 << 16) - 1

# Number of array elements written per line of generated source.
CODE_POINTS_PER_ROW = 16


class GenerateAccessor(Enum):
    """Whether index_<name>_code_point() is generated for a table."""

    NO = False
    YES = True

    def __bool__(self) -> bool:
        # Members of a plain Enum are always truthy; without this, `if flag:` is true for NO.
        return bool(self.value)


class GenerateInverseAccessor(Enum):
    """Whether code_point_<name>_index() is generated for a table."""

    NO = False
    YES = True

    def __bool__(self) -> bool:
        # See GenerateAccessor.__bool__.
        return bool(self.value)


@dataclass
class LookupTable:
    """One index from indexes.json, flattened into a dense list of code points."""

    # Number of leading null entries that were skipped instead of stored.
    first_pointer: int
    # Largest value seen while building the table (placeholders included).
    max_code_point: int
    # Stored entries in pointer order, starting at pointer `first_pointer`.
    code_points: list[int]
    generate_accessor: GenerateAccessor
    generate_inverse_accessor: GenerateInverseAccessor


@dataclass
class LookupTables:
    """Everything that ends up in the generated files."""

    # The "gb18030-ranges" array: [pointer, code point] pairs.
    gb18030_ranges: list[Any]
    # Generated C++ identifier fragment -> table, in emission order.
    indexes: dict[str, LookupTable]


# FIXME: Update JSON to match GB-18030-2022 Encoding specification (https://github.com/whatwg/encoding/issues/312)
# NOTE: See https://commits.webkit.org/264918@main
# Position in the gb18030 table's code_points -> replacement value.
GB18030_2022_OVERRIDES = {
    7182: 0xFE10,
    7183: 0xFE12,
    7184: 0xFE11,
    7185: 0xFE13,
    7186: 0xFE14,
    7187: 0xFE15,
    7188: 0xFE16,
    7201: 0xFE17,
    7202: 0xFE18,
    7208: 0xFE19,
    23775: 0x9FB4,
    23783: 0x9FB5,
    23788: 0x9FB6,
    23789: 0x9FB7,
    23795: 0x9FB8,
    23812: 0x9FB9,
    23829: 0x9FBA,
    23845: 0x9FBB,
}

_FORWARD = (GenerateAccessor.YES, GenerateInverseAccessor.NO)
_FORWARD_AND_INVERSE = (GenerateAccessor.YES, GenerateInverseAccessor.YES)
_DATA_ONLY = (GenerateAccessor.NO, GenerateInverseAccessor.NO)

# (key in indexes.json, forward accessor, inverse accessor), in emission order. The generated
# identifier is the key with "-" replaced by "_". The flags match the tool this script replaces.
INDEX_SPECS = (
    ("gb18030", *_FORWARD_AND_INVERSE),
    ("big5", *_FORWARD),
    ("jis0208", *_FORWARD_AND_INVERSE),
    ("jis0212", *_FORWARD),
    ("euc-kr", *_FORWARD_AND_INVERSE),
    ("ibm866", *_DATA_ONLY),
    ("iso-2022-jp-katakana", *_FORWARD),
    ("iso-8859-2", *_DATA_ONLY),
    ("iso-8859-3", *_DATA_ONLY),
    ("iso-8859-4", *_DATA_ONLY),
    ("iso-8859-5", *_DATA_ONLY),
    ("iso-8859-6", *_DATA_ONLY),
    ("iso-8859-7", *_DATA_ONLY),
    ("iso-8859-8", *_DATA_ONLY),
    ("iso-8859-10", *_DATA_ONLY),
    ("iso-8859-13", *_DATA_ONLY),
    ("iso-8859-14", *_DATA_ONLY),
    ("iso-8859-15", *_DATA_ONLY),
    ("iso-8859-16", *_DATA_ONLY),
    ("koi8-r", *_DATA_ONLY),
    ("koi8-u", *_DATA_ONLY),
    ("macintosh", *_DATA_ONLY),
    ("windows-874", *_DATA_ONLY),
    ("windows-1250", *_DATA_ONLY),
    ("windows-1251", *_DATA_ONLY),
    ("windows-1252", *_DATA_ONLY),
    ("windows-1253", *_DATA_ONLY),
    ("windows-1254", *_DATA_ONLY),
    ("windows-1255", *_DATA_ONLY),
    ("windows-1256", *_DATA_ONLY),
    ("windows-1257", *_DATA_ONLY),
    ("windows-1258", *_DATA_ONLY),
    ("x-mac-cyrillic", *_DATA_ONLY),
)


def prepare_table(
    data: list[Any],
    generate_accessor: GenerateAccessor = GenerateAccessor.NO,
    generate_inverse_accessor: GenerateInverseAccessor = GenerateInverseAccessor.NO,
) -> LookupTable:
    """Convert one JSON index array into a LookupTable.

    Args:
        data: The index array; each entry is a code point or None.
        generate_accessor: Request index_<name>_code_point(). Also enables trimming of trailing
            placeholder entries, which the accessor's bounds check makes redundant.
        generate_inverse_accessor: Request code_point_<name>_index().

    Returns:
        The table, carrying exactly the two flags that were passed in.

    Raises:
        AssertionError: If no accessor is requested but the index starts with null entries
            (nothing would apply the first_pointer offset for such a table).
    """
    code_points: list[int] = []
    first_pointer = 0

    for entry in data:
        if entry is not None:
            code_points.append(int(entry))
        elif code_points:
            code_points.append(REPLACEMENT_CODE_POINT)
        else:
            # Leading nulls are not stored; they only shift the first pointer.
            first_pointer += 1

    # Taken before trimming so the element type does not depend on the trim.
    max_code_point = max(code_points, default=0)

    if generate_accessor is GenerateAccessor.YES:
        while code_points and code_points[-1] == REPLACEMENT_CODE_POINT:
            code_points.pop()
    elif first_pointer != 0:
        # Raised explicitly so the check survives `python -O`.
        raise AssertionError("index without accessor must not start with null entries")

    return LookupTable(
        first_pointer=first_pointer,
        max_code_point=max_code_point,
        code_points=code_points,
        generate_accessor=generate_accessor,
        generate_inverse_accessor=generate_inverse_accessor,
    )


def generate_table(name: str, table: LookupTable) -> str:
    """Return the header text for one table.

    That is the optional s_<name>_index_first_pointer constant, the s_<name>_index array
    (CODE_POINTS_PER_ROW elements per line) and the declarations of the requested accessors.
    """
    value_type = "u32" if table.max_code_point > MAX_U16 else "u16"
    lines = []

    if table.first_pointer > 0:
        lines.append(f"static constexpr u32 s_{name}_index_first_pointer = {table.first_pointer};")

    lines.append(f"static constexpr Array<{value_type}, {len(table.code_points)}> s_{name}_index {{")

    formatted = [f"0x{point:04x}" for point in table.code_points]
    rows = [
        ", ".join(formatted[start : start + CODE_POINTS_PER_ROW])
        for start in range(0, len(formatted), CODE_POINTS_PER_ROW)
    ]
    if rows:
        lines.append("    " + ",\n    ".join(rows))
    lines.append("};")

    if table.generate_accessor is GenerateAccessor.YES:
        lines.append(f"Optional<u32> index_{name}_code_point(u32 pointer);")

    if table.generate_inverse_accessor is GenerateInverseAccessor.YES:
        lines.append(f"Optional<u32> code_point_{name}_index(u32 code_point);")

    return "\n".join(lines)


def generate_header_file(tables: LookupTables, output_path: Path) -> None:
    """Write the generated header (range table plus every index array) to output_path."""
    # NOTE(review): the include targets and the template arguments of Array/Optional were
    # missing from the text this was reconstructed from; confirm the header names below.
    parts = [
        "#pragma once\n"
        "\n"
        "#include <AK/Array.h>\n"
        "#include <AK/Types.h>\n"
        "\n"
        "namespace TextCodec {\n"
        "\n"
        "struct Gb18030RangeEntry {\n"
        "    u32 pointer;\n"
        "    u32 code_point;\n"
        "};\n"
        "\n"
        f"static constexpr Array<Gb18030RangeEntry, {len(tables.gb18030_ranges)}> s_gb18030_ranges {{ {{\n"
    ]

    for pointer, code_point in tables.gb18030_ranges:
        parts.append(f"    {{ {pointer}, 0x{code_point:04x} }},\n")
    parts.append("} };\n\n")

    for name, table in tables.indexes.items():
        parts.append(generate_table(name, table) + "\n\n")

    parts.append("}\n")

    Path(output_path).write_text("".join(parts), encoding="utf-8")


def generate_table_accessor(name: str, table: LookupTable) -> str:
    """Return the definition of index_<name>_code_point(pointer).

    The generated function yields an empty Optional when the pointer is outside the stored range
    or the stored value is the 0xfffd placeholder. Tables with a non-zero first_pointer subtract
    it before indexing.
    """
    array = f"s_{name}_index"
    if table.first_pointer > 0:
        first = f"{array}_first_pointer"
        out_of_range = f"pointer < {first} || pointer - {first} >= {array}.size()"
        position = f"pointer - {first}"
    else:
        out_of_range = f"pointer >= {array}.size()"
        position = "pointer"

    return f"""
Optional<u32> index_{name}_code_point(u32 pointer)
{{
    if ({out_of_range})
        return {{}};
    auto value = {array}[{position}];
    if (value == 0xfffd)
        return {{}};
    return value;
}}
"""


def generate_inverse_table_accessor(name: str, table: LookupTable) -> str:
    """Return the definition of code_point_<name>_index(code_point).

    The generated function scans the array front to back and yields the pointer of the first
    match (offset by first_pointer when that is non-zero), or an empty Optional.
    """
    array = f"s_{name}_index"
    pointer = f"{array}_first_pointer + i" if table.first_pointer > 0 else "i"

    # FIXME: A linear search is slow; a reverse lookup table should be generated instead.
    return f"""
Optional<u32> code_point_{name}_index(u32 code_point)
{{
    for (u32 i = 0; i < {array}.size(); ++i) {{
        if ({array}[i] == code_point) {{
            return {pointer};
        }}
    }}
    return {{}};
}}
"""


def generate_implementation_file(tables: LookupTables, output_path: Path) -> None:
    """Write the accessor definitions requested by each table to output_path."""
    # NOTE(review): include target reconstructed, see generate_header_file().
    parts = ["\n#include <LibTextCodec/LookupTables.h>\n\nnamespace TextCodec {\n"]

    for name, table in tables.indexes.items():
        if table.generate_accessor is GenerateAccessor.YES:
            parts.append(generate_table_accessor(name, table))
        if table.generate_inverse_accessor is GenerateInverseAccessor.YES:
            parts.append(generate_inverse_table_accessor(name, table))

    parts.append("\n}\n")

    Path(output_path).write_text("".join(parts), encoding="utf-8")


def build_lookup_tables(data: dict[str, Any]) -> LookupTables:
    """Build every table listed in INDEX_SPECS from the parsed indexes.json object."""
    indexes = {
        json_key.replace("-", "_"): prepare_table(data[json_key], accessor, inverse_accessor)
        for json_key, accessor, inverse_accessor in INDEX_SPECS
    }

    gb18030_code_points = indexes["gb18030"].code_points
    for position, code_point in GB18030_2022_OVERRIDES.items():
        # NOTE(review): positions past the end of the table are skipped silently; the tool this
        # replaces indexed unconditionally. Confirm whether a hard failure is preferable.
        if position < len(gb18030_code_points):
            gb18030_code_points[position] = code_point

    return LookupTables(gb18030_ranges=data["gb18030-ranges"], indexes=indexes)


def main(argv: Optional[list[str]] = None) -> None:
    """Parse -h / -c / -j, load the JSON and write both generated files.

    Args:
        argv: Argument list to parse; defaults to sys.argv[1:].
    """
    # -h is the header path here, so argparse's built-in help flag has to be disabled.
    parser = argparse.ArgumentParser(description="Generate text codec lookup tables", add_help=False)
    parser.add_argument("-h", required=True, dest="generated_header_path",
                        help="Path to the lookup table header file to generate")
    parser.add_argument("-c", required=True, dest="generated_implementation_path",
                        help="Path to the lookup table implementation file to generate")
    parser.add_argument("-j", required=True, dest="json_path",
                        help="Path to the JSON file to read from")

    args = parser.parse_args(argv)

    with open(args.json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    tables = build_lookup_tables(data)

    generate_header_file(tables, Path(args.generated_header_path))
    generate_implementation_file(tables, Path(args.generated_implementation_path))


if __name__ == "__main__":
    main()