diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index 57cc7472d5c..a076031dd18 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -10,9 +10,6 @@ set(UCD_VERSION_FILE "${UCD_PATH}/version.txt") set(UCD_ZIP_URL "https://www.unicode.org/Public/${UCD_VERSION}/ucd/UCD.zip") set(UCD_ZIP_PATH "${UCD_PATH}/UCD.zip") -set(UNICODE_DATA_SOURCE "UnicodeData.txt") -set(UNICODE_DATA_PATH "${UCD_PATH}/${UNICODE_DATA_SOURCE}") - string(REGEX REPLACE "([0-9]+\\.[0-9]+)\\.[0-9]+" "\\1" EMOJI_VERSION "${UCD_VERSION}") set(EMOJI_TEST_URL "https://www.unicode.org/Public/emoji/${EMOJI_VERSION}/emoji-test.txt") set(EMOJI_TEST_PATH "${UCD_PATH}/emoji-test.txt") @@ -26,18 +23,12 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) if (ENABLE_NETWORK_DOWNLOADS) download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}" SHA256 "${UCD_SHA256}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${UNICODE_DATA_SOURCE}" "${UNICODE_DATA_PATH}") - download_file("${EMOJI_TEST_URL}" "${EMOJI_TEST_PATH}" SHA256 "${EMOJI_SHA256}") else() message(STATUS "Skipping download of ${UCD_ZIP_URL}, expecting the archive to have been extracted to ${UCD_ZIP_PATH}") message(STATUS "Skipping download of ${EMOJI_TEST_URL}, expecting the file to be at ${EMOJI_TEST_PATH}") endif() - - set(UNICODE_DATA_HEADER UnicodeData.h) - set(UNICODE_DATA_IMPLEMENTATION UnicodeData.cpp) - set(EMOJI_DATA_HEADER EmojiData.h) set(EMOJI_DATA_IMPLEMENTATION EmojiData.cpp) @@ -45,14 +36,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) set(EMOJI_INSTALL_ARG -i "${EMOJI_INSTALL_PATH}") endif() - invoke_generator( - "UnicodeData" - Lagom::GenerateUnicodeData - "${UCD_VERSION_FILE}" - "${UNICODE_DATA_HEADER}" - "${UNICODE_DATA_IMPLEMENTATION}" - arguments -u "${UNICODE_DATA_PATH}" - ) invoke_generator( "EmojiData" Lagom::GenerateEmojiData @@ -69,8 +52,6 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) ) set(UNICODE_DATA_SOURCES - ${UNICODE_DATA_HEADER} - ${UNICODE_DATA_IMPLEMENTATION} ${EMOJI_DATA_HEADER} ${EMOJI_DATA_IMPLEMENTATION} ) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt index b18637a1848..81b415aa2d8 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/CMakeLists.txt @@ -1,2 +1 @@ -lagom_tool(GenerateUnicodeData SOURCES GenerateUnicodeData.cpp LIBS LibMain) lagom_tool(GenerateEmojiData SOURCES GenerateEmojiData.cpp LIBS LibMain) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp deleted file mode 100644 index fa23fa6f900..00000000000 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ /dev/null @@ -1,291 +0,0 @@ -/* - * Copyright (c) 2021, Tim Flynn - * - * SPDX-License-Identifier: BSD-2-Clause - */ - -#include "GeneratorUtil.h" -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -// https://www.unicode.org/reports/tr44/#PropList.txt -using PropList = HashMap>; - -// https://www.unicode.org/reports/tr44/#UnicodeData.txt -struct CodePointData { - u32 code_point { 0 }; - ByteString name; - ByteString bidi_class; - Optional numeric_value_decimal; - Optional numeric_value_digit; - Optional numeric_value_numeric; - bool bidi_mirrored { false }; - ByteString unicode_1_name; - ByteString iso_comment; -}; - -struct CodePointBidiClass { - Unicode::CodePointRange code_point_range; - ByteString bidi_class; -}; - -struct UnicodeData { - Vector code_point_data; - - HashTable bidirectional_classes; - Vector code_point_bidirectional_classes; -}; - -static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeData& unicode_data) -{ - Optional code_point_range_start; - Array buffer; - - while (TRY(file.can_read_line())) { - auto line = TRY(file.read_line(buffer)); - - if (line.is_empty()) - continue; - - auto segments = line.split_view(';', SplitBehavior::KeepEmpty); - VERIFY(segments.size() == 15); - - CodePointData data {}; - data.code_point = AK::StringUtils::convert_to_uint_from_hex(segments[0]).value(); - data.name = segments[1]; - data.bidi_class = segments[4]; - data.numeric_value_decimal = AK::StringUtils::convert_to_int(segments[6]); - data.numeric_value_digit = AK::StringUtils::convert_to_int(segments[7]); - data.numeric_value_numeric = AK::StringUtils::convert_to_int(segments[8]); - data.bidi_mirrored = segments[9] == "Y"sv; - data.unicode_1_name = segments[10]; - data.iso_comment = segments[11]; - - if (data.name.starts_with("<"sv) && data.name.ends_with(", First>"sv)) { - VERIFY(!code_point_range_start.has_value()); - code_point_range_start = data.code_point; - - data.name = data.name.substring(1, data.name.length() - 9); - } else if (data.name.starts_with("<"sv) && data.name.ends_with(", Last>"sv)) { - VERIFY(code_point_range_start.has_value()); - - Unicode::CodePointRange code_point_range { *code_point_range_start, data.code_point }; - - data.name = data.name.substring(1, data.name.length() - 8); - code_point_range_start.clear(); - - unicode_data.code_point_bidirectional_classes.append({ code_point_range, data.bidi_class }); - } else { - unicode_data.code_point_bidirectional_classes.append({ { data.code_point, data.code_point }, data.bidi_class }); - } - - unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep); - unicode_data.code_point_data.append(move(data)); - } - - return {}; -} - -static ErrorOr generate_unicode_data_header(Core::InputBufferedFile& file, UnicodeData& unicode_data) -{ - StringBuilder builder; - SourceGenerator generator { builder }; - - auto generate_enum = [&](StringView name, StringView default_, auto values, Vector aliases = {}) { - quick_sort(values); - quick_sort(aliases, [](auto& alias1, auto& alias2) { return alias1.alias < alias2.alias; }); - - generator.set("name", name); - generator.set("underlying", ByteString::formatted("{}UnderlyingType", name)); - generator.set("type", ((values.size() + !default_.is_empty()) < 256) ? "u8"sv : "u16"sv); - - generator.append(R"~~~( -using @underlying@ = @type@; - -enum class @name@ : @underlying@ {)~~~"); - - if (!default_.is_empty()) { - generator.set("default", default_); - generator.append(R"~~~( - @default@,)~~~"); - } - - for (auto const& value : values) { - generator.set("value", value); - generator.append(R"~~~( - @value@,)~~~"); - } - - for (auto const& alias : aliases) { - generator.set("alias", alias.alias); - generator.set("value", alias.name); - generator.append(R"~~~( - @alias@ = @value@,)~~~"); - } - - generator.append(R"~~~( -}; -)~~~"); - }; - - generator.append(R"~~~( -#pragma once - -#include -#include - -namespace Unicode { -)~~~"); - - generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values()); - - generator.append(R"~~~( -} -)~~~"); - - TRY(file.write_until_depleted(generator.as_string_view().bytes())); - return {}; -} - -static ErrorOr generate_unicode_data_implementation(Core::InputBufferedFile& file, UnicodeData const& unicode_data) -{ - StringBuilder builder; - SourceGenerator generator { builder }; - - generator.append(R"~~~( -#include -#include -#include -#include -#include -#include -#include -#include -#include - -namespace Unicode { -)~~~"); - - generator.append(R"~~~( -struct BidiClassData { - CodePointRange code_point_range {}; - BidirectionalClass bidi_class {}; -}; - -struct CodePointBidiClassComparator : public CodePointRangeComparator { - constexpr int operator()(u32 code_point, BidiClassData const& bidi_class) - { - return CodePointRangeComparator::operator()(code_point, bidi_class.code_point_range); - } -}; - -)~~~"); - - { - constexpr size_t max_bidi_classes_per_row = 20; - size_t bidi_classes_in_current_row = 0; - - generator.set("size"sv, ByteString::number(unicode_data.code_point_bidirectional_classes.size())); - generator.append(R"~~~( -static constexpr Array s_bidirectional_classes { { -)~~~"); - for (auto const& data : unicode_data.code_point_bidirectional_classes) { - if (bidi_classes_in_current_row++ > 0) - generator.append(", "); - - generator.set("first", ByteString::formatted("{:#x}", data.code_point_range.first)); - generator.set("last", ByteString::formatted("{:#x}", data.code_point_range.last)); - generator.set("bidi_class", data.bidi_class); - generator.append("{ { @first@, @last@ }, BidirectionalClass::@bidi_class@ }"); - - if (bidi_classes_in_current_row == max_bidi_classes_per_row) { - bidi_classes_in_current_row = 0; - generator.append(",\n "); - } - } - generator.append(R"~~~( -} }; -)~~~"); - } - - generator.append(R"~~~( -Optional bidirectional_class(u32 code_point) -{ - if (auto const* entry = binary_search(s_bidirectional_classes, code_point, nullptr, CodePointBidiClassComparator {})) - return entry->bidi_class; - - return {}; -} -)~~~"); - - auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& prop_list, Vector const& aliases) -> ErrorOr { - HashValueMap hashes; - TRY(hashes.try_ensure_capacity(prop_list.size() + aliases.size())); - - ValueFromStringOptions options {}; - - for (auto const& prop : prop_list) { - if constexpr (IsSame, ByteString>) { - hashes.set(CaseInsensitiveASCIIStringViewTraits::hash(prop), prop); - options.sensitivity = CaseSensitivity::CaseInsensitive; - } else { - hashes.set(prop.key.hash(), prop.key); - } - } - - for (auto const& alias : aliases) - hashes.set(alias.alias.hash(), alias.alias); - - generate_value_from_string(generator, "{}_from_string"sv, enum_title, enum_snake, move(hashes), options); - - return {}; - }; - - TRY(append_from_string("BidirectionalClass"sv, "bidirectional_class"sv, unicode_data.bidirectional_classes, {})); - - generator.append(R"~~~( -} -)~~~"); - - TRY(file.write_until_depleted(generator.as_string_view().bytes())); - return {}; -} - -ErrorOr serenity_main(Main::Arguments arguments) -{ - StringView generated_header_path; - StringView generated_implementation_path; - StringView unicode_data_path; - - Core::ArgsParser args_parser; - args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path"); - args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); - args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path"); - args_parser.parse(arguments); - - auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write)); - auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write)); - auto unicode_data_file = TRY(open_file(unicode_data_path, Core::File::OpenMode::Read)); - - UnicodeData unicode_data {}; - - TRY(parse_unicode_data(*unicode_data_file, unicode_data)); - - TRY(generate_unicode_data_header(*generated_header_file, unicode_data)); - TRY(generate_unicode_data_implementation(*generated_implementation_file, unicode_data)); - - return 0; -} diff --git a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp index bfc175165e1..b5b033ceda2 100644 --- a/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp +++ b/Tests/LibUnicode/TestUnicodeCharacterTypes.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -307,25 +307,13 @@ TEST_CASE(script_extension) TEST_CASE(code_point_bidirectional_character_type) { - auto code_point_bidi_class = [](u32 code_point) { - auto bidi_class = Unicode::bidirectional_class(code_point); - VERIFY(bidi_class.has_value()); - return bidi_class.release_value(); - }; - - auto bidi_class_from_string = [](StringView name) { - auto result = Unicode::bidirectional_class_from_string(name); - VERIFY(result.has_value()); - return result.release_value(); - }; - // Left-to-right - EXPECT_EQ(code_point_bidi_class('A'), bidi_class_from_string("L"sv)); - EXPECT_EQ(code_point_bidi_class('z'), bidi_class_from_string("L"sv)); + EXPECT_EQ(Unicode::bidirectional_class('A'), Unicode::BidiClass::LeftToRight); + EXPECT_EQ(Unicode::bidirectional_class('z'), Unicode::BidiClass::LeftToRight); // European number - EXPECT_EQ(code_point_bidi_class('7'), bidi_class_from_string("EN"sv)); + EXPECT_EQ(Unicode::bidirectional_class('7'), Unicode::BidiClass::EuropeanNumber); // Whitespace - EXPECT_EQ(code_point_bidi_class(' '), bidi_class_from_string("WS"sv)); + EXPECT_EQ(Unicode::bidirectional_class(' '), Unicode::BidiClass::WhiteSpaceNeutral); // Arabic right-to-left (U+FEB4 ARABIC LETTER SEEN MEDIAL FORM) - EXPECT_EQ(code_point_bidi_class(0xFEB4), bidi_class_from_string("AL"sv)); + EXPECT_EQ(Unicode::bidirectional_class(0xFEB4), Unicode::BidiClass::RightToLeftArabic); } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 4a02618e34c..967477c41e1 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -310,7 +310,67 @@ bool code_point_has_script_extension(u32 code_point, Script script) return static_cast(uscript_hasScript(icu_code_point, icu_script)); } -Optional __attribute__((weak)) bidirectional_class_from_string(StringView) { return {}; } -Optional __attribute__((weak)) bidirectional_class(u32) { return {}; } +static constexpr BidiClass char_direction_to_bidi_class(UCharDirection direction) +{ + switch (direction) { + case U_ARABIC_NUMBER: + return BidiClass::ArabicNumber; + case U_BLOCK_SEPARATOR: + return BidiClass::BlockSeparator; + case U_BOUNDARY_NEUTRAL: + return BidiClass::BoundaryNeutral; + case U_COMMON_NUMBER_SEPARATOR: + return BidiClass::CommonNumberSeparator; + case U_DIR_NON_SPACING_MARK: + return BidiClass::DirNonSpacingMark; + case U_EUROPEAN_NUMBER: + return BidiClass::EuropeanNumber; + case U_EUROPEAN_NUMBER_SEPARATOR: + return BidiClass::EuropeanNumberSeparator; + case U_EUROPEAN_NUMBER_TERMINATOR: + return BidiClass::EuropeanNumberTerminator; + case U_FIRST_STRONG_ISOLATE: + return BidiClass::FirstStrongIsolate; + case U_LEFT_TO_RIGHT: + return BidiClass::LeftToRight; + case U_LEFT_TO_RIGHT_EMBEDDING: + return BidiClass::LeftToRightEmbedding; + case U_LEFT_TO_RIGHT_ISOLATE: + return BidiClass::LeftToRightIsolate; + case U_LEFT_TO_RIGHT_OVERRIDE: + return BidiClass::LeftToRightOverride; + case U_OTHER_NEUTRAL: + return BidiClass::OtherNeutral; + case U_POP_DIRECTIONAL_FORMAT: + return BidiClass::PopDirectionalFormat; + case U_POP_DIRECTIONAL_ISOLATE: + return BidiClass::PopDirectionalIsolate; + case U_RIGHT_TO_LEFT: + return BidiClass::RightToLeft; + case U_RIGHT_TO_LEFT_ARABIC: + return BidiClass::RightToLeftArabic; + case U_RIGHT_TO_LEFT_EMBEDDING: + return BidiClass::RightToLeftEmbedding; + case U_RIGHT_TO_LEFT_ISOLATE: + return BidiClass::RightToLeftIsolate; + case U_RIGHT_TO_LEFT_OVERRIDE: + return BidiClass::RightToLeftOverride; + case U_SEGMENT_SEPARATOR: + return BidiClass::SegmentSeparator; + case U_WHITE_SPACE_NEUTRAL: + return BidiClass::WhiteSpaceNeutral; + case U_CHAR_DIRECTION_COUNT: + break; + } + VERIFY_NOT_REACHED(); +} + +BidiClass bidirectional_class(u32 code_point) +{ + auto icu_code_point = static_cast(code_point); + + auto direction = u_charDirection(icu_code_point); + return char_direction_to_bidi_class(direction); +} } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.h b/Userland/Libraries/LibUnicode/CharacterTypes.h index 0674bd4cbfa..6521918b46d 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.h +++ b/Userland/Libraries/LibUnicode/CharacterTypes.h @@ -49,7 +49,32 @@ Optional