mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-04-20 19:45:12 +00:00
LibUnicode: Download and parse IDNA data
This commit is contained in:
parent
cfd0a60863
commit
7d9fe44039
Notes:
sideshowbarker
2024-07-16 23:08:48 +09:00
Author: https://github.com/skyrising Commit: https://github.com/SerenityOS/serenity/commit/7d9fe44039 Pull-request: https://github.com/SerenityOS/serenity/pull/19414 Reviewed-by: https://github.com/AtkinsSJ Reviewed-by: https://github.com/nico Reviewed-by: https://github.com/shannonbooth Reviewed-by: https://github.com/trflynn89
6 changed files with 324 additions and 30 deletions
|
@ -68,6 +68,9 @@ set(EMOJI_RES_PATH "${SerenityOS_SOURCE_DIR}/Base/res/emoji")
|
|||
set(EMOJI_SERENITY_PATH "${SerenityOS_SOURCE_DIR}/Base/home/anon/Documents/emoji-serenity.txt")
|
||||
set(EMOJI_INSTALL_PATH "${CMAKE_BINARY_DIR}/Root/home/anon/Documents/emoji.txt")
|
||||
|
||||
# Remote URL of IdnaMappingTable.txt (keyed on UCD_VERSION) and the path under UCD_PATH
# that refers to the local copy of that file.
set(IDNA_MAPPING_TABLE_URL "https://www.unicode.org/Public/idna/${UCD_VERSION}/IdnaMappingTable.txt")
|
||||
set(IDNA_MAPPING_TABLE_PATH "${UCD_PATH}/IdnaMappingTable.txt")
|
||||
|
||||
if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
||||
remove_path_if_version_changed("${UCD_VERSION}" "${UCD_VERSION_FILE}" "${UCD_PATH}")
|
||||
|
||||
|
@ -98,12 +101,17 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
|||
message(STATUS "Skipping download of ${EMOJI_TEST_URL}, expecting the archive to have been extracted to ${EMOJI_TEST_PATH}")
|
||||
endif()
|
||||
|
||||
# Fetch the IDNA mapping table from IDNA_MAPPING_TABLE_URL into IDNA_MAPPING_TABLE_PATH.
download_file("${IDNA_MAPPING_TABLE_URL}" "${IDNA_MAPPING_TABLE_PATH}")
|
||||
|
||||
set(UNICODE_DATA_HEADER UnicodeData.h)
|
||||
set(UNICODE_DATA_IMPLEMENTATION UnicodeData.cpp)
|
||||
|
||||
set(EMOJI_DATA_HEADER EmojiData.h)
|
||||
set(EMOJI_DATA_IMPLEMENTATION EmojiData.cpp)
|
||||
|
||||
# File names used for the IDNA data header / implementation pair.
set(IDNA_DATA_HEADER IDNAData.h)
|
||||
set(IDNA_DATA_IMPLEMENTATION IDNAData.cpp)
|
||||
|
||||
if (SERENITYOS)
|
||||
set(EMOJI_INSTALL_ARG -i "${EMOJI_INSTALL_PATH}")
|
||||
endif()
|
||||
|
@ -130,11 +138,21 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD)
|
|||
# the generated emoji.txt file.
|
||||
dependencies "${EMOJI_RES_PATH}" "${EMOJI_SERENITY_PATH}"
|
||||
)
|
||||
# Run the Lagom::GenerateIDNAData tool to produce IDNA_DATA_HEADER and
# IDNA_DATA_IMPLEMENTATION, passing it the mapping table path via -m.
# NOTE(review): UCD_VERSION_FILE presumably acts as the regeneration trigger - confirm
# against the invoke_generator() definition.
invoke_generator(
|
||||
"IDNAData"
|
||||
Lagom::GenerateIDNAData
|
||||
"${UCD_VERSION_FILE}"
|
||||
"${IDNA_DATA_HEADER}"
|
||||
"${IDNA_DATA_IMPLEMENTATION}"
|
||||
arguments -m "${IDNA_MAPPING_TABLE_PATH}"
|
||||
)
|
||||
|
||||
# Collect the Unicode, emoji and IDNA header / implementation file names into one list.
set(UNICODE_DATA_SOURCES
|
||||
${UNICODE_DATA_HEADER}
|
||||
${UNICODE_DATA_IMPLEMENTATION}
|
||||
${EMOJI_DATA_HEADER}
|
||||
${EMOJI_DATA_IMPLEMENTATION}
|
||||
${IDNA_DATA_HEADER}
|
||||
${IDNA_DATA_IMPLEMENTATION}
|
||||
)
|
||||
endif()
|
||||
|
|
|
@ -1,2 +1,3 @@
|
|||
# One Lagom tool per LibUnicode code generator; each is built from a single source file
# and links against LibMain.
lagom_tool(GenerateUnicodeData SOURCES GenerateUnicodeData.cpp LIBS LibMain)
|
||||
lagom_tool(GenerateEmojiData SOURCES GenerateEmojiData.cpp LIBS LibMain)
|
||||
lagom_tool(GenerateIDNAData SOURCES GenerateIDNAData.cpp LIBS LibMain)
|
||||
|
|
240
Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateIDNAData.cpp
Normal file
240
Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateIDNAData.cpp
Normal file
|
@ -0,0 +1,240 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Simon Wanner <simon@skyrising.xyz>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include "GeneratorUtil.h"
|
||||
#include <AK/Error.h>
|
||||
#include <AK/SourceGenerator.h>
|
||||
#include <AK/Types.h>
|
||||
#include <LibCore/ArgsParser.h>
|
||||
|
||||
enum class MappingStatus : u8 {
|
||||
Valid,
|
||||
Ignored,
|
||||
Mapped,
|
||||
Deviation,
|
||||
Disallowed,
|
||||
DisallowedStd3Valid,
|
||||
DisallowedStd3Mapped,
|
||||
};
|
||||
|
||||
static constexpr Array<StringView, 7> mapping_status_names { "Valid"sv, "Ignored"sv, "Mapped"sv, "Deviation"sv, "Disallowed"sv, "DisallowedStd3Valid"sv, "DisallowedStd3Mapped"sv };
|
||||
|
||||
enum class IDNA2008Status : u8 {
|
||||
NV8,
|
||||
XV8,
|
||||
};
|
||||
|
||||
static constexpr Array<StringView, 2> idna_2008_status_names { "NV8"sv, "XV8"sv };
|
||||
|
||||
struct IDNAMapping {
|
||||
Unicode::CodePointRange code_points;
|
||||
MappingStatus status;
|
||||
IDNA2008Status idna_2008_status;
|
||||
Vector<u32> mapped_to {};
|
||||
};
|
||||
|
||||
struct IDNAData {
|
||||
Vector<IDNAMapping> mapping_table;
|
||||
};
|
||||
|
||||
// Translates the textual status column of a mapping table row into a MappingStatus.
// An unrecognised spelling is treated as a fatal error (VERIFY_NOT_REACHED).
static MappingStatus parse_mapping_status(StringView status)
{
    struct KnownStatus {
        StringView spelling;
        MappingStatus value;
    };

    static constexpr Array<KnownStatus, 7> known_statuses { {
        { "valid"sv, MappingStatus::Valid },
        { "ignored"sv, MappingStatus::Ignored },
        { "mapped"sv, MappingStatus::Mapped },
        { "deviation"sv, MappingStatus::Deviation },
        { "disallowed"sv, MappingStatus::Disallowed },
        { "disallowed_STD3_valid"sv, MappingStatus::DisallowedStd3Valid },
        { "disallowed_STD3_mapped"sv, MappingStatus::DisallowedStd3Mapped },
    } };

    for (auto const& candidate : known_statuses) {
        if (status == candidate.spelling)
            return candidate.value;
    }

    VERIFY_NOT_REACHED();
}
|
||||
|
||||
// Reads the mapping table from `file` line by line and appends one IDNAMapping per
// data row to `mapping_table`.
//
// Row layout (fields separated by ';', a '#' starts a comment):
//   code point or range ; status [; mapped code points [; IDNA2008 status]]
//
// Read and allocation failures are propagated; malformed rows trip a VERIFY.
static ErrorOr<void> parse_idna_mapping_table(Core::InputBufferedFile& file, Vector<IDNAMapping>& mapping_table)
{
    Array<u8, 1024> line_buffer;

    while (TRY(file.can_read_line())) {
        auto line = TRY(file.read_line(line_buffer));

        // Blank lines and full-line comments carry no data.
        if (line.is_empty() || line.starts_with('#'))
            continue;

        // Cut off a trailing comment, if there is one.
        auto comment_start = line.find('#');
        if (comment_start.has_value())
            line = line.substring_view(0, comment_start.value());

        // Empty fields are kept so that the field positions stay stable.
        auto fields = line.split_view(';', SplitBehavior::KeepEmpty);
        VERIFY(fields.size() >= 2);

        IDNAMapping row {};
        row.code_points = parse_code_point_range(fields[0].trim_whitespace());
        row.status = parse_mapping_status(fields[1].trim_whitespace());

        if (fields.size() >= 3)
            row.mapped_to = parse_code_point_list(fields[2].trim_whitespace());

        if (fields.size() >= 4) {
            auto idna_2008_column = fields[3].trim_whitespace();
            if (idna_2008_column == "NV8"sv) {
                row.idna_2008_status = IDNA2008Status::NV8;
            } else {
                // The only other accepted marker is XV8.
                VERIFY(idna_2008_column == "XV8"sv);
                row.idna_2008_status = IDNA2008Status::XV8;
            }
        }

        TRY(mapping_table.try_append(move(row)));
    }

    return {};
}
|
||||
|
||||
// Writes the generated header, which only declares Unicode::IDNA::get_idna_mapping(),
// to `file`. The parsed data is not needed for the header, hence the unnamed parameter.
static ErrorOr<void> generate_idna_data_header(Core::InputBufferedFile& file, IDNAData&)
{
    StringBuilder header_text;
    SourceGenerator generator { header_text };

    generator.append(R"~~~(
#pragma once

#include <AK/Optional.h>
#include <LibUnicode/IDNA.h>

namespace Unicode::IDNA {

Optional<Mapping> get_idna_mapping(u32 code_point);

}
)~~~");

    auto const generated_bytes = generator.as_string_view().bytes();
    TRY(file.write_until_depleted(generated_bytes));
    return {};
}
|
||||
|
||||
// Writes the generated implementation file to `file`. It consists of:
//   1. s_idna_mapping_table: one MappingEntry per parsed row, in input order;
//   2. s_mapping_code_points: the replacement code points of all rows, concatenated;
//      each entry refers to its slice through mapping_offset / mapping_length;
//   3. get_idna_mapping(): a binary search over the table by code point.
//
// NOTE(review): the generated bit-fields give mapping_offset 20 bits and mapping_length
// 8 bits; nothing here checks that the emitted values fit.
static ErrorOr<void> generate_idna_data_implementation(Core::InputBufferedFile& file, IDNAData& idna_data)
{
    StringBuilder implementation_text;
    SourceGenerator generator { implementation_text };

    auto const& rows = idna_data.mapping_table;

    generator.set("idna_table_size", TRY(String::number(rows.size())));

    generator.append(R"~~~(

#include <AK/BinarySearch.h>
#include <AK/Utf32View.h>
#include <LibUnicode/IDNAData.h>
#include <LibUnicode/CharacterTypes.h>

namespace Unicode::IDNA {

struct MappingEntry {
    CodePointRange code_points {};
    MappingStatus status : 3 { MappingStatus::Valid };
    IDNA2008Status idna_2008_status : 1 { IDNA2008Status::NV8 };
    size_t mapping_offset : 20 { 0 };
    size_t mapping_length : 8 { 0 };
};

static constexpr Array<MappingEntry, @idna_table_size@> s_idna_mapping_table { {)~~~");

    // Part 1: the table rows. `next_offset` tracks where the next row's replacement
    // code points will start inside s_mapping_code_points.
    size_t next_offset = 0;
    for (auto const& row : rows) {
        generator.set("code_points", TRY(String::formatted("{:#x}, {:#x}", row.code_points.first, row.code_points.last)));
        generator.set("status", mapping_status_names[to_underlying(row.status)]);
        generator.set("idna_2008_status", idna_2008_status_names[to_underlying(row.idna_2008_status)]);

        if (row.mapped_to.is_empty()) {
            // Rows without replacement code points get an empty slice at offset 0.
            generator.set("mapping_offset", "0"sv);
            generator.set("mapping_length", "0"sv);
        } else {
            generator.set("mapping_offset", TRY(String::number(next_offset)));
            generator.set("mapping_length", TRY(String::number(row.mapped_to.size())));
            next_offset += row.mapped_to.size();
        }

        generator.append(R"~~~(
    { { @code_points@ }, MappingStatus::@status@, IDNA2008Status::@idna_2008_status@, @mapping_offset@, @mapping_length@ },)~~~");
    }

    // After the loop the running offset equals the total number of replacement code points.
    generator.set("mapping_length_total", TRY(String::number(next_offset)));

    generator.append(R"~~~(
} };

static constexpr Array<u32, @mapping_length_total@> s_mapping_code_points { )~~~");

    // Part 2: the replacement code points, one output line per row that has any.
    // Must visit the rows in the same order as part 1 so the offsets line up.
    for (auto const& row : rows) {
        if (row.mapped_to.is_empty())
            continue;

        for (u32 replacement : row.mapped_to)
            generator.append(TRY(String::formatted("{:#x}, ", replacement)));

        generator.append(R"~~~(
    )~~~");
    }

    // Part 3: the lookup function.
    generator.append(R"~~~(
};

Optional<Mapping> get_idna_mapping(u32 code_point)
{
    auto* entry = binary_search(s_idna_mapping_table, code_point, nullptr, [](auto code_point, auto entry) {
        if (code_point < entry.code_points.first)
            return -1;
        if (code_point > entry.code_points.last)
            return 1;
        return 0;
    });

    if (!entry)
        return {};

    auto mapped_to = Utf32View { entry->mapping_length ? s_mapping_code_points.data() + entry->mapping_offset : nullptr, entry->mapping_length };
    return Mapping { entry->status, entry->idna_2008_status, move(mapped_to) };
}

}
)~~~");

    auto const generated_bytes = generator.as_string_view().bytes();
    TRY(file.write_until_depleted(generated_bytes));
    return {};
}
|
||||
|
||||
// Entry point of the generator: parses the command line, reads the IDNA mapping
// table and writes the generated header and implementation files.
//
// Options:
//   -h / --generated-header-path          output header
//   -c / --generated-implementation-path  output implementation
//   -m / --idna-mapping-table-path        input IdnaMappingTable.txt
//
// Returns 0 on success; any file or parse error is propagated.
ErrorOr<int> serenity_main(Main::Arguments arguments)
{
    StringView header_path;
    StringView implementation_path;
    StringView mapping_table_path;

    Core::ArgsParser args_parser;
    args_parser.add_option(header_path, "Path to the IDNA Data header file to generate", "generated-header-path", 'h', "generated-header-path");
    args_parser.add_option(implementation_path, "Path to the IDNA Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path");
    args_parser.add_option(mapping_table_path, "Path to IdnaMappingTable.txt file", "idna-mapping-table-path", 'm', "idna-mapping-table-path");
    args_parser.parse(arguments);

    // The files are opened in this order on purpose: outputs first, then the input.
    auto header_file = TRY(open_file(header_path, Core::File::OpenMode::Write));
    auto implementation_file = TRY(open_file(implementation_path, Core::File::OpenMode::Write));
    auto mapping_table_file = TRY(open_file(mapping_table_path, Core::File::OpenMode::Read));

    IDNAData parsed_data {};
    TRY(parse_idna_mapping_table(*mapping_table_file, parsed_data.mapping_table));

    TRY(generate_idna_data_header(*header_file, parsed_data));
    TRY(generate_idna_data_implementation(*implementation_file, parsed_data));

    return 0;
}
|
|
@ -212,36 +212,6 @@ static DeprecatedString sanitize_entry(DeprecatedString const& entry)
|
|||
return builder.to_deprecated_string();
|
||||
}
|
||||
|
||||
// Parses a space-separated list of hexadecimal code points (e.g. "0073 0073").
// An empty list yields an empty vector; a token that is not valid hex is fatal
// because the Optional is unwrapped unconditionally.
static Vector<u32> parse_code_point_list(StringView list)
{
    Vector<u32> parsed_code_points;

    for (auto const& hex_token : list.split_view(' ')) {
        auto const value = AK::StringUtils::convert_to_uint_from_hex<u32>(hex_token);
        parsed_code_points.append(value.value());
    }

    return parsed_code_points;
}
|
||||
|
||||
// Parses either a single hexadecimal code point ("00AD") or an inclusive range
// written as "first..last" ("0000..002C"). A single code point becomes a range whose
// first and last are equal. Malformed input is fatal.
static Unicode::CodePointRange parse_code_point_range(StringView list)
{
    auto parse_hex = [](StringView digits) {
        return AK::StringUtils::convert_to_uint_from_hex<u32>(digits).value();
    };

    if (!list.contains(".."sv)) {
        auto const single = parse_hex(list);
        return Unicode::CodePointRange { single, single };
    }

    auto const bounds = list.split_view(".."sv);
    VERIFY(bounds.size() == 2);

    auto const first = parse_hex(bounds[0]);
    auto const last = parse_hex(bounds[1]);
    return Unicode::CodePointRange { first, last };
}
|
||||
|
||||
static ErrorOr<void> parse_special_casing(Core::InputBufferedFile& file, UnicodeData& unicode_data)
|
||||
{
|
||||
Array<u8, 1024> buffer;
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include <AK/Vector.h>
|
||||
#include <LibCore/File.h>
|
||||
#include <LibLocale/Locale.h>
|
||||
#include <LibUnicode/CharacterTypes.h>
|
||||
|
||||
template<class T>
|
||||
inline constexpr bool StorageTypeIsList = false;
|
||||
|
@ -598,3 +599,33 @@ ReadonlySpan<StringView> @name@()
|
|||
}
|
||||
)~~~");
|
||||
}
|
||||
|
||||
// Parses a space-separated list of hexadecimal code points (e.g. "0073 0073").
// An empty list yields an empty vector; a token that is not valid hex is fatal
// because the Optional is unwrapped unconditionally.
inline Vector<u32> parse_code_point_list(StringView list)
{
    Vector<u32> parsed_code_points;

    auto const hex_tokens = list.split_view(' ');
    for (size_t i = 0; i < hex_tokens.size(); ++i) {
        auto const value = AK::StringUtils::convert_to_uint_from_hex<u32>(hex_tokens[i]);
        parsed_code_points.append(value.value());
    }

    return parsed_code_points;
}
|
||||
|
||||
// Parses either a single hexadecimal code point ("00AD") or an inclusive range
// written as "first..last" ("0000..002C"). A single code point becomes a range whose
// first and last are equal. Malformed input is fatal.
inline Unicode::CodePointRange parse_code_point_range(StringView list)
{
    auto parse_hex = [](StringView digits) {
        return AK::StringUtils::convert_to_uint_from_hex<u32>(digits).value();
    };

    bool const is_range = list.contains(".."sv);
    if (!is_range) {
        auto const single = parse_hex(list);
        return Unicode::CodePointRange { single, single };
    }

    auto const bounds = list.split_view(".."sv);
    VERIFY(bounds.size() == 2);

    auto const first = parse_hex(bounds[0]);
    auto const last = parse_hex(bounds[1]);
    return Unicode::CodePointRange { first, last };
}
|
||||
|
|
34
Userland/Libraries/LibUnicode/IDNA.h
Normal file
34
Userland/Libraries/LibUnicode/IDNA.h
Normal file
|
@ -0,0 +1,34 @@
|
|||
/*
|
||||
* Copyright (c) 2023, Simon Wanner <simon@skyrising.xyz>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Utf32View.h>
|
||||
|
||||
namespace Unicode::IDNA {
|
||||
|
||||
enum class MappingStatus : u8 {
|
||||
Valid,
|
||||
Ignored,
|
||||
Mapped,
|
||||
Deviation,
|
||||
Disallowed,
|
||||
DisallowedStd3Valid,
|
||||
DisallowedStd3Mapped,
|
||||
};
|
||||
|
||||
enum class IDNA2008Status : u8 {
|
||||
NV8,
|
||||
XV8,
|
||||
};
|
||||
|
||||
struct Mapping {
|
||||
MappingStatus status;
|
||||
IDNA2008Status idna_2008_status;
|
||||
Utf32View mapped_to;
|
||||
};
|
||||
|
||||
}
|
Loading…
Add table
Reference in a new issue