From c804bda5fd757a396f056fb22f28c63b3b47605a Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Fri, 21 Jun 2024 10:39:40 -0400 Subject: [PATCH] LibUnicode: Replace code point properties with ICU --- Meta/CMake/unicode_data.cmake | 26 +- .../LibUnicode/GenerateUnicodeData.cpp | 173 +------------ Userland/Libraries/LibGfx/TextLayout.cpp | 6 +- Userland/Libraries/LibJS/Lexer.cpp | 6 +- Userland/Libraries/LibRegex/RegexParser.cpp | 14 +- .../Libraries/LibUnicode/CharacterTypes.cpp | 232 +++++++++++++----- .../Libraries/LibUnicode/CharacterTypes.h | 9 + Userland/Libraries/LibUnicode/Emoji.cpp | 24 +- Userland/Libraries/LibUnicode/Forward.h | 4 +- 9 files changed, 196 insertions(+), 298 deletions(-) diff --git a/Meta/CMake/unicode_data.cmake b/Meta/CMake/unicode_data.cmake index 9e3c7129ba5..490caec9d94 100644 --- a/Meta/CMake/unicode_data.cmake +++ b/Meta/CMake/unicode_data.cmake @@ -16,18 +16,6 @@ set(UNICODE_DATA_PATH "${UCD_PATH}/${UNICODE_DATA_SOURCE}") set(DERIVED_GENERAL_CATEGORY_SOURCE "extracted/DerivedGeneralCategory.txt") set(DERIVED_GENERAL_CATEGORY_PATH "${UCD_PATH}/${DERIVED_GENERAL_CATEGORY_SOURCE}") -set(PROP_LIST_SOURCE "PropList.txt") -set(PROP_LIST_PATH "${UCD_PATH}/${PROP_LIST_SOURCE}") - -set(DERIVED_CORE_PROP_SOURCE "DerivedCoreProperties.txt") -set(DERIVED_CORE_PROP_PATH "${UCD_PATH}/${DERIVED_CORE_PROP_SOURCE}") - -set(DERIVED_BINARY_PROP_SOURCE "extracted/DerivedBinaryProperties.txt") -set(DERIVED_BINARY_PROP_PATH "${UCD_PATH}/${DERIVED_BINARY_PROP_SOURCE}") - -set(PROP_ALIAS_SOURCE "PropertyAliases.txt") -set(PROP_ALIAS_PATH "${UCD_PATH}/${PROP_ALIAS_SOURCE}") - set(PROP_VALUE_ALIAS_SOURCE "PropertyValueAliases.txt") set(PROP_VALUE_ALIAS_PATH "${UCD_PATH}/${PROP_VALUE_ALIAS_SOURCE}") @@ -37,12 +25,6 @@ set(SCRIPTS_PATH "${UCD_PATH}/${SCRIPTS_SOURCE}") set(SCRIPT_EXTENSIONS_SOURCE "ScriptExtensions.txt") set(SCRIPT_EXTENSIONS_PATH "${UCD_PATH}/${SCRIPT_EXTENSIONS_SOURCE}") -set(EMOJI_DATA_SOURCE "emoji/emoji-data.txt") -set(EMOJI_DATA_PATH "${UCD_PATH}/${EMOJI_DATA_SOURCE}") - -set(NORM_PROPS_SOURCE "DerivedNormalizationProps.txt") -set(NORM_PROPS_PATH "${UCD_PATH}/${NORM_PROPS_SOURCE}") - string(REGEX REPLACE "([0-9]+\\.[0-9]+)\\.[0-9]+" "\\1" EMOJI_VERSION "${UCD_VERSION}") set(EMOJI_TEST_URL "https://www.unicode.org/Public/emoji/${EMOJI_VERSION}/emoji-test.txt") set(EMOJI_TEST_PATH "${UCD_PATH}/emoji-test.txt") @@ -58,15 +40,9 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) download_file("${UCD_ZIP_URL}" "${UCD_ZIP_PATH}" SHA256 "${UCD_SHA256}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${UNICODE_DATA_SOURCE}" "${UNICODE_DATA_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_GENERAL_CATEGORY_SOURCE}" "${DERIVED_GENERAL_CATEGORY_PATH}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_LIST_SOURCE}" "${PROP_LIST_PATH}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_CORE_PROP_SOURCE}" "${DERIVED_CORE_PROP_PATH}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${DERIVED_BINARY_PROP_SOURCE}" "${DERIVED_BINARY_PROP_PATH}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_ALIAS_SOURCE}" "${PROP_ALIAS_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${PROP_VALUE_ALIAS_SOURCE}" "${PROP_VALUE_ALIAS_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPTS_SOURCE}" "${SCRIPTS_PATH}") extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${SCRIPT_EXTENSIONS_SOURCE}" "${SCRIPT_EXTENSIONS_PATH}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${EMOJI_DATA_SOURCE}" "${EMOJI_DATA_PATH}") - extract_path("${UCD_PATH}" "${UCD_ZIP_PATH}" "${NORM_PROPS_SOURCE}" "${NORM_PROPS_PATH}") download_file("${EMOJI_TEST_URL}" "${EMOJI_TEST_PATH}" SHA256 "${EMOJI_SHA256}") else() @@ -91,7 +67,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) "${UCD_VERSION_FILE}" "${UNICODE_DATA_HEADER}" "${UNICODE_DATA_IMPLEMENTATION}" - arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -p "${PROP_LIST_PATH}" -d "${DERIVED_CORE_PROP_PATH}" -b "${DERIVED_BINARY_PROP_PATH}" -a "${PROP_ALIAS_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" -e "${EMOJI_DATA_PATH}" -n "${NORM_PROPS_PATH}" + arguments -u "${UNICODE_DATA_PATH}" -g "${DERIVED_GENERAL_CATEGORY_PATH}" -v "${PROP_VALUE_ALIAS_PATH}" -r "${SCRIPTS_PATH}" -x "${SCRIPT_EXTENSIONS_PATH}" ) invoke_generator( "EmojiData" diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 591e8e3f577..327e53b29db 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -24,21 +24,6 @@ // https://www.unicode.org/reports/tr44/#PropList.txt using PropList = HashMap>; -// https://www.unicode.org/reports/tr44/#DerivedNormalizationProps.txt -enum class QuickCheck { - Yes, - No, - Maybe, -}; - -struct Normalization { - Unicode::CodePointRange code_point_range; - Vector value; - QuickCheck quick_check { QuickCheck::Yes }; -}; - -using NormalizationProps = HashMap>; - // https://www.unicode.org/reports/tr44/#UnicodeData.txt struct CodePointData { u32 code_point { 0 }; @@ -79,27 +64,13 @@ struct UnicodeData { PropList general_categories; Vector general_category_aliases; - // The Unicode standard defines additional properties (Any, Assigned, ASCII) which are not in - // any UCD file. Assigned code point ranges are derived as this generator is executed. - // https://unicode.org/reports/tr18/#General_Category_Property - PropList prop_list { - { "Any"sv, { { 0, 0x10ffff } } }, - { "Assigned"sv, {} }, - { "ASCII"sv, { { 0, 0x7f } } }, - }; - Vector prop_aliases; - PropList script_list { { "Unknown"sv, {} }, }; Vector script_aliases; PropList script_extensions; - // FIXME: We are not yet doing anything with this data. It will be needed for String.prototype.normalize. - NormalizationProps normalization_props; - CodePointTables general_category_tables; - CodePointTables property_tables; CodePointTables script_tables; CodePointTables script_extension_tables; @@ -172,52 +143,6 @@ static ErrorOr parse_prop_list(Core::InputBufferedFile& file, PropList& pr return {}; } -static ErrorOr parse_alias_list(Core::InputBufferedFile& file, PropList const& prop_list, Vector& prop_aliases) -{ - ByteString current_property; - Array buffer; - - auto append_alias = [&](auto alias, auto property) { - // Note: The alias files contain lines such as "Hyphen = Hyphen", which we should just skip. - if (alias == property) - return; - - // FIXME: We will, eventually, need to find where missing properties are located and parse them. - if (!prop_list.contains(property)) - return; - - prop_aliases.append({ property, alias }); - }; - - while (TRY(file.can_read_line())) { - auto line = TRY(file.read_line(buffer)); - - if (line.is_empty() || line.starts_with('#')) { - if (line.ends_with("Properties"sv)) - current_property = line.substring_view(2); - continue; - } - - // Note: For now, we only care about Binary Property aliases for Unicode property escapes. - if (current_property != "Binary Properties"sv) - continue; - - auto segments = line.split_view(';', SplitBehavior::KeepEmpty); - VERIFY((segments.size() == 2) || (segments.size() == 3)); - - auto alias = segments[0].trim_whitespace(); - auto property = segments[1].trim_whitespace(); - append_alias(alias, property); - - if (segments.size() == 3) { - alias = segments[2].trim_whitespace(); - append_alias(alias, property); - } - } - - return {}; -} - static ErrorOr parse_value_alias_list(Core::InputBufferedFile& file, StringView desired_category, Vector const& value_list, Vector& prop_aliases, bool primary_value_is_first = true, bool sanitize_alias = false) { TRY(file.seek(0, SeekMode::SetPosition)); @@ -264,57 +189,9 @@ static ErrorOr parse_value_alias_list(Core::InputBufferedFile& file, Strin return {}; } -static ErrorOr parse_normalization_props(Core::InputBufferedFile& file, UnicodeData& unicode_data) -{ - Array buffer; - - while (TRY(file.can_read_line())) { - auto line = TRY(file.read_line(buffer)); - - if (line.is_empty() || line.starts_with('#')) - continue; - - if (auto index = line.find('#'); index.has_value()) - line = line.substring_view(0, *index); - - auto segments = line.split_view(';', SplitBehavior::KeepEmpty); - VERIFY((segments.size() == 2) || (segments.size() == 3)); - - auto code_point_range = parse_code_point_range(segments[0].trim_whitespace()); - auto property = segments[1].trim_whitespace().to_byte_string(); - - Vector value; - QuickCheck quick_check = QuickCheck::Yes; - - if (segments.size() == 3) { - auto value_or_quick_check = segments[2].trim_whitespace(); - - if ((value_or_quick_check == "N"sv)) - quick_check = QuickCheck::No; - else if ((value_or_quick_check == "M"sv)) - quick_check = QuickCheck::Maybe; - else - value = parse_code_point_list(value_or_quick_check); - } - - auto& normalizations = unicode_data.normalization_props.ensure(property); - normalizations.append({ code_point_range, move(value), quick_check }); - - auto& prop_list = unicode_data.prop_list.ensure(property); - prop_list.append(move(code_point_range)); - } - - return {}; -} - static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeData& unicode_data) { Optional code_point_range_start; - - auto& assigned_code_points = unicode_data.prop_list.find("Assigned"sv)->value; - Optional assigned_code_point_range_start = 0; - u32 previous_code_point = 0; - Array buffer; while (TRY(file.can_read_line())) { @@ -337,22 +214,15 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa data.unicode_1_name = segments[10]; data.iso_comment = segments[11]; - if (!assigned_code_point_range_start.has_value()) - assigned_code_point_range_start = data.code_point; - if (data.name.starts_with("<"sv) && data.name.ends_with(", First>"sv)) { - VERIFY(!code_point_range_start.has_value() && assigned_code_point_range_start.has_value()); + VERIFY(!code_point_range_start.has_value()); code_point_range_start = data.code_point; data.name = data.name.substring(1, data.name.length() - 9); - - assigned_code_points.append({ *assigned_code_point_range_start, previous_code_point }); - assigned_code_point_range_start.clear(); } else if (data.name.starts_with("<"sv) && data.name.ends_with(", Last>"sv)) { VERIFY(code_point_range_start.has_value()); Unicode::CodePointRange code_point_range { *code_point_range_start, data.code_point }; - assigned_code_points.append(code_point_range); data.name = data.name.substring(1, data.name.length() - 8); code_point_range_start.clear(); @@ -360,18 +230,9 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa unicode_data.code_point_bidirectional_classes.append({ code_point_range, data.bidi_class }); } else { unicode_data.code_point_bidirectional_classes.append({ { data.code_point, data.code_point }, data.bidi_class }); - - if ((data.code_point > 0) && (data.code_point - previous_code_point) != 1) { - VERIFY(assigned_code_point_range_start.has_value()); - - assigned_code_points.append({ *assigned_code_point_range_start, previous_code_point }); - assigned_code_point_range_start = data.code_point; - } } unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep); - - previous_code_point = data.code_point; unicode_data.code_point_data.append(move(data)); } @@ -430,7 +291,6 @@ namespace Unicode { )~~~"); generate_enum("GeneralCategory"sv, {}, unicode_data.general_categories.keys(), unicode_data.general_category_aliases); - generate_enum("Property"sv, {}, unicode_data.prop_list.keys(), unicode_data.prop_aliases); generate_enum("Script"sv, {}, unicode_data.script_list.keys(), unicode_data.script_aliases); generate_enum("BidirectionalClass"sv, {}, unicode_data.bidirectional_classes.values()); @@ -460,7 +320,6 @@ static ErrorOr generate_unicode_data_implementation(Core::InputBufferedFil #include #include #include -#include namespace Unicode { )~~~"); @@ -547,7 +406,6 @@ static constexpr Array<@type@, @size@> @name@ { { }; TRY(append_code_point_tables("s_general_categories"sv, unicode_data.general_category_tables, append_property_table)); - TRY(append_code_point_tables("s_properties"sv, unicode_data.property_tables, append_property_table)); TRY(append_code_point_tables("s_scripts"sv, unicode_data.script_tables, append_property_table)); TRY(append_code_point_tables("s_script_extensions"sv, unicode_data.script_extension_tables, append_property_table)); @@ -634,9 +492,6 @@ bool code_point_has_@enum_snake@(u32 code_point, @enum_title@ @enum_snake@) TRY(append_prop_search("GeneralCategory"sv, "general_category"sv, "s_general_categories"sv)); TRY(append_from_string("GeneralCategory"sv, "general_category"sv, unicode_data.general_categories, unicode_data.general_category_aliases)); - TRY(append_prop_search("Property"sv, "property"sv, "s_properties"sv)); - TRY(append_from_string("Property"sv, "property"sv, unicode_data.prop_list, unicode_data.prop_aliases)); - TRY(append_prop_search("Script"sv, "script"sv, "s_scripts"sv)); TRY(append_prop_search("Script"sv, "script_extension"sv, "s_script_extensions"sv)); TRY(append_from_string("Script"sv, "script"sv, unicode_data.script_list, unicode_data.script_aliases)); @@ -901,13 +756,11 @@ static ErrorOr create_code_point_tables(UnicodeData& unicode_data) }; auto general_category_metadata = TRY(PropertyMetadata::create(unicode_data.general_categories)); - auto property_metadata = TRY(PropertyMetadata::create(unicode_data.prop_list)); auto script_metadata = TRY(PropertyMetadata::create(unicode_data.script_list)); auto script_extension_metadata = TRY(PropertyMetadata::create(unicode_data.script_extensions)); for (u32 code_point = 0; code_point <= MAX_CODE_POINT; ++code_point) { TRY(update_property_tables(code_point, unicode_data.general_category_tables, general_category_metadata)); - TRY(update_property_tables(code_point, unicode_data.property_tables, property_metadata)); TRY(update_property_tables(code_point, unicode_data.script_tables, script_metadata)); TRY(update_property_tables(code_point, unicode_data.script_extension_tables, script_extension_metadata)); } @@ -921,54 +774,30 @@ ErrorOr serenity_main(Main::Arguments arguments) StringView generated_implementation_path; StringView unicode_data_path; StringView derived_general_category_path; - StringView prop_list_path; - StringView derived_core_prop_path; - StringView derived_binary_prop_path; - StringView prop_alias_path; StringView prop_value_alias_path; StringView scripts_path; StringView script_extensions_path; - StringView emoji_data_path; - StringView normalization_path; Core::ArgsParser args_parser; args_parser.add_option(generated_header_path, "Path to the Unicode Data header file to generate", "generated-header-path", 'h', "generated-header-path"); args_parser.add_option(generated_implementation_path, "Path to the Unicode Data implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); args_parser.add_option(unicode_data_path, "Path to UnicodeData.txt file", "unicode-data-path", 'u', "unicode-data-path"); args_parser.add_option(derived_general_category_path, "Path to DerivedGeneralCategory.txt file", "derived-general-category-path", 'g', "derived-general-category-path"); - args_parser.add_option(prop_list_path, "Path to PropList.txt file", "prop-list-path", 'p', "prop-list-path"); - args_parser.add_option(derived_core_prop_path, "Path to DerivedCoreProperties.txt file", "derived-core-prop-path", 'd', "derived-core-prop-path"); - args_parser.add_option(derived_binary_prop_path, "Path to DerivedBinaryProperties.txt file", "derived-binary-prop-path", 'b', "derived-binary-prop-path"); - args_parser.add_option(prop_alias_path, "Path to PropertyAliases.txt file", "prop-alias-path", 'a', "prop-alias-path"); args_parser.add_option(prop_value_alias_path, "Path to PropertyValueAliases.txt file", "prop-value-alias-path", 'v', "prop-value-alias-path"); args_parser.add_option(scripts_path, "Path to Scripts.txt file", "scripts-path", 'r', "scripts-path"); args_parser.add_option(script_extensions_path, "Path to ScriptExtensions.txt file", "script-extensions-path", 'x', "script-extensions-path"); - args_parser.add_option(emoji_data_path, "Path to emoji-data.txt file", "emoji-data-path", 'e', "emoji-data-path"); - args_parser.add_option(normalization_path, "Path to DerivedNormalizationProps.txt file", "normalization-path", 'n', "normalization-path"); args_parser.parse(arguments); auto generated_header_file = TRY(open_file(generated_header_path, Core::File::OpenMode::Write)); auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write)); auto unicode_data_file = TRY(open_file(unicode_data_path, Core::File::OpenMode::Read)); auto derived_general_category_file = TRY(open_file(derived_general_category_path, Core::File::OpenMode::Read)); - auto prop_list_file = TRY(open_file(prop_list_path, Core::File::OpenMode::Read)); - auto derived_core_prop_file = TRY(open_file(derived_core_prop_path, Core::File::OpenMode::Read)); - auto derived_binary_prop_file = TRY(open_file(derived_binary_prop_path, Core::File::OpenMode::Read)); - auto prop_alias_file = TRY(open_file(prop_alias_path, Core::File::OpenMode::Read)); auto prop_value_alias_file = TRY(open_file(prop_value_alias_path, Core::File::OpenMode::Read)); auto scripts_file = TRY(open_file(scripts_path, Core::File::OpenMode::Read)); auto script_extensions_file = TRY(open_file(script_extensions_path, Core::File::OpenMode::Read)); - auto emoji_data_file = TRY(open_file(emoji_data_path, Core::File::OpenMode::Read)); - auto normalization_file = TRY(open_file(normalization_path, Core::File::OpenMode::Read)); UnicodeData unicode_data {}; TRY(parse_prop_list(*derived_general_category_file, unicode_data.general_categories)); - TRY(parse_prop_list(*prop_list_file, unicode_data.prop_list)); - TRY(parse_prop_list(*derived_core_prop_file, unicode_data.prop_list)); - TRY(parse_prop_list(*derived_binary_prop_file, unicode_data.prop_list)); - TRY(parse_prop_list(*emoji_data_file, unicode_data.prop_list)); - TRY(parse_normalization_props(*normalization_file, unicode_data)); - TRY(parse_alias_list(*prop_alias_file, unicode_data.prop_list, unicode_data.prop_aliases)); TRY(parse_prop_list(*scripts_file, unicode_data.script_list)); TRY(parse_prop_list(*script_extensions_file, unicode_data.script_extensions, true)); diff --git a/Userland/Libraries/LibGfx/TextLayout.cpp b/Userland/Libraries/LibGfx/TextLayout.cpp index bc4a5b3cf0a..db55074d9b7 100644 --- a/Userland/Libraries/LibGfx/TextLayout.cpp +++ b/Userland/Libraries/LibGfx/TextLayout.cpp @@ -19,16 +19,12 @@ DrawGlyphOrEmoji prepare_draw_glyph_or_emoji(FloatPoint point, Utf8CodePointIter auto next_code_point = it.peek(1); ScopeGuard consume_variation_selector = [&, initial_it = it] { - static auto const variation_selector = Unicode::property_from_string("Variation_Selector"sv); - if (!variation_selector.has_value()) - return; - // If we advanced the iterator to consume an emoji sequence, don't look for another variation selector. if (initial_it != it) return; // Otherwise, discard one code point if it's a variation selector. - if (next_code_point.has_value() && Unicode::code_point_has_property(*next_code_point, *variation_selector)) + if (next_code_point.has_value() && Unicode::code_point_has_variation_selector_property(*next_code_point)) ++it; }; diff --git a/Userland/Libraries/LibJS/Lexer.cpp b/Userland/Libraries/LibJS/Lexer.cpp index f26e5c62b7f..1c7a91e8dcd 100644 --- a/Userland/Libraries/LibJS/Lexer.cpp +++ b/Userland/Libraries/LibJS/Lexer.cpp @@ -468,8 +468,7 @@ Optional Lexer::is_identifier_start(size_t& identifier_length) const if (is_ascii(code_point)) return {}; - static auto id_start_category = Unicode::property_from_string("ID_Start"sv); - if (id_start_category.has_value() && Unicode::code_point_has_property(code_point, *id_start_category)) + if (Unicode::code_point_has_identifier_start_property(code_point)) return code_point; return {}; @@ -503,8 +502,7 @@ Optional Lexer::is_identifier_middle(size_t& identifier_length) const if (is_ascii(code_point)) return {}; - static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv); - if (id_continue_category.has_value() && Unicode::code_point_has_property(code_point, *id_continue_category)) + if (Unicode::code_point_has_identifier_continue_property(code_point)) return code_point; return {}; diff --git a/Userland/Libraries/LibRegex/RegexParser.cpp b/Userland/Libraries/LibRegex/RegexParser.cpp index e2696068ca7..e51bca2f2f7 100644 --- a/Userland/Libraries/LibRegex/RegexParser.cpp +++ b/Userland/Libraries/LibRegex/RegexParser.cpp @@ -1644,7 +1644,7 @@ bool ECMA262Parser::parse_atom_escape(ByteCode& stack, size_t& match_length_mini compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 }); property.visit( [&](Unicode::Property property) { - compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property }); + compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() }); }, [&](Unicode::GeneralCategory general_category) { compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category }); @@ -1996,7 +1996,7 @@ bool ECMA262Parser::parse_nonempty_class_ranges(Vector& ranges.empend(CompareTypeAndValuePair { CharacterCompareType::TemporaryInverse, 0 }); if (atom.is_property) - ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property) }); + ranges.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)(atom.property.value()) }); else if (atom.is_general_category) ranges.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)(atom.general_category) }); else if (atom.is_script) @@ -2332,7 +2332,7 @@ bool ECMA262Parser::parse_class_set_operand(Vector& c compares.empend(CompareTypeAndValuePair { CharacterCompareType::Inverse, 0 }); property.visit( [&](Unicode::Property property) { - compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property }); + compares.empend(CompareTypeAndValuePair { CharacterCompareType::Property, (ByteCodeValueType)property.value() }); }, [&](Unicode::GeneralCategory general_category) { compares.empend(CompareTypeAndValuePair { CharacterCompareType::GeneralCategory, (ByteCodeValueType)general_category }); @@ -2476,8 +2476,6 @@ bool ECMA262Parser::parse_unicode_property_escape(PropertyEscape& property, bool DeprecatedFlyString ECMA262Parser::read_capture_group_specifier(bool take_starting_angle_bracket) { - static auto id_start_category = Unicode::property_from_string("ID_Start"sv); - static auto id_continue_category = Unicode::property_from_string("ID_Continue"sv); static constexpr u32 const REPLACEMENT_CHARACTER = 0xFFFD; constexpr u32 const ZERO_WIDTH_NON_JOINER { 0x200C }; constexpr u32 const ZERO_WIDTH_JOINER { 0x200D }; @@ -2526,7 +2524,7 @@ DeprecatedFlyString ECMA262Parser::read_capture_group_specifier(bool take_starti set_error(Error::InvalidNameForCaptureGroup); return {}; } - } else if (id_start_category.has_value() && !Unicode::code_point_has_property(code_point, *id_start_category)) { + } else if (!Unicode::code_point_has_identifier_start_property(code_point)) { set_error(Error::InvalidNameForCaptureGroup); return {}; } @@ -2569,7 +2567,7 @@ DeprecatedFlyString ECMA262Parser::read_capture_group_specifier(bool take_starti return {}; } } else if (code_point != ZERO_WIDTH_JOINER && code_point != ZERO_WIDTH_NON_JOINER) { - if (id_continue_category.has_value() && !Unicode::code_point_has_property(code_point, *id_continue_category)) { + if (!Unicode::code_point_has_identifier_continue_property(code_point)) { set_error(Error::InvalidNameForCaptureGroup); return {}; } diff --git a/Userland/Libraries/LibUnicode/CharacterTypes.cpp b/Userland/Libraries/LibUnicode/CharacterTypes.cpp index 612eaacd269..5771ff33d02 100644 --- a/Userland/Libraries/LibUnicode/CharacterTypes.cpp +++ b/Userland/Libraries/LibUnicode/CharacterTypes.cpp @@ -4,84 +4,186 @@ * SPDX-License-Identifier: BSD-2-Clause */ +#include +#include +#include +#include #include -#if ENABLE_UNICODE_DATA -# include -#endif +#include + +namespace Unicode { + +template +struct PropertyName { + Optional long_name; + Optional short_name; + Optional additional_name; +}; + +// From uchar.h: +// Unicode allows for additional names, beyond the long and short name, which would be indicated by U_LONG_PROPERTY_NAME + i +static constexpr auto ADDITIONAL_NAME = static_cast(U_LONG_PROPERTY_NAME + 1); + +} + +template +struct AK::Traits> { + static constexpr bool equals(Unicode::PropertyName const& candidate, StringView property) + { + return property == candidate.long_name || property == candidate.short_name || property == candidate.additional_name; + } +}; namespace Unicode { Optional __attribute__((weak)) general_category_from_string(StringView) { return {}; } bool __attribute__((weak)) code_point_has_general_category(u32, GeneralCategory) { return {}; } -Optional __attribute__((weak)) property_from_string(StringView) { return {}; } -bool __attribute__((weak)) code_point_has_property(u32, Property) { return {}; } -bool is_ecma262_property([[maybe_unused]] Property property) +static constexpr Property PROPERTY_ANY = UCHAR_BINARY_LIMIT + 1; +static constexpr Property PROPERTY_ASCII = UCHAR_BINARY_LIMIT + 2; +static constexpr Property PROPERTY_ASSIGNED = UCHAR_BINARY_LIMIT + 3; +static constexpr Property PROPERTY_LIMIT = UCHAR_BINARY_LIMIT + 4; + +Optional property_from_string(StringView property) { -#if ENABLE_UNICODE_DATA - // EMCA-262 only allows a subset of Unicode properties: https://tc39.es/ecma262/#table-binary-unicode-properties - switch (property) { - case Unicode::Property::ASCII: - case Unicode::Property::ASCII_Hex_Digit: - case Unicode::Property::Alphabetic: - case Unicode::Property::Any: - case Unicode::Property::Assigned: - case Unicode::Property::Bidi_Control: - case Unicode::Property::Bidi_Mirrored: - case Unicode::Property::Case_Ignorable: - case Unicode::Property::Cased: - case Unicode::Property::Changes_When_Casefolded: - case Unicode::Property::Changes_When_Casemapped: - case Unicode::Property::Changes_When_Lowercased: - case Unicode::Property::Changes_When_NFKC_Casefolded: - case Unicode::Property::Changes_When_Titlecased: - case Unicode::Property::Changes_When_Uppercased: - case Unicode::Property::Dash: - case Unicode::Property::Default_Ignorable_Code_Point: - case Unicode::Property::Deprecated: - case Unicode::Property::Diacritic: - case Unicode::Property::Emoji: - case Unicode::Property::Emoji_Component: - case Unicode::Property::Emoji_Modifier: - case Unicode::Property::Emoji_Modifier_Base: - case Unicode::Property::Emoji_Presentation: - case Unicode::Property::Extended_Pictographic: - case Unicode::Property::Extender: - case Unicode::Property::Grapheme_Base: - case Unicode::Property::Grapheme_Extend: - case Unicode::Property::Hex_Digit: - case Unicode::Property::IDS_Binary_Operator: - case Unicode::Property::IDS_Trinary_Operator: - case Unicode::Property::ID_Continue: - case Unicode::Property::ID_Start: - case Unicode::Property::Ideographic: - case Unicode::Property::Join_Control: - case Unicode::Property::Logical_Order_Exception: - case Unicode::Property::Lowercase: - case Unicode::Property::Math: - case Unicode::Property::Noncharacter_Code_Point: - case Unicode::Property::Pattern_Syntax: - case Unicode::Property::Pattern_White_Space: - case Unicode::Property::Quotation_Mark: - case Unicode::Property::Radical: - case Unicode::Property::Regional_Indicator: - case Unicode::Property::Sentence_Terminal: - case Unicode::Property::Soft_Dotted: - case Unicode::Property::Terminal_Punctuation: - case Unicode::Property::Unified_Ideograph: - case Unicode::Property::Uppercase: - case Unicode::Property::Variation_Selector: - case Unicode::Property::White_Space: - case Unicode::Property::XID_Continue: - case Unicode::Property::XID_Start: + static auto property_names = []() { + Array, PROPERTY_LIMIT.value()> names; + + for (Property property = 0; property < UCHAR_BINARY_LIMIT; ++property) { + auto icu_property = static_cast(property.value()); + + if (char const* name = u_getPropertyName(icu_property, U_LONG_PROPERTY_NAME)) + names[property.value()].long_name = StringView { name, strlen(name) }; + if (char const* name = u_getPropertyName(icu_property, U_SHORT_PROPERTY_NAME)) + names[property.value()].short_name = StringView { name, strlen(name) }; + if (char const* name = u_getPropertyName(icu_property, ADDITIONAL_NAME)) + names[property.value()].additional_name = StringView { name, strlen(name) }; + } + + names[PROPERTY_ANY.value()] = { "Any"sv, {}, {} }; + names[PROPERTY_ASCII.value()] = { "ASCII"sv, {}, {} }; + names[PROPERTY_ASSIGNED.value()] = { "Assigned"sv, {}, {} }; + + return names; + }(); + + if (auto index = find_index(property_names.begin(), property_names.end(), property); index != property_names.size()) + return static_cast(index); + return {}; +} + +bool code_point_has_property(u32 code_point, Property property) +{ + auto icu_code_point = static_cast(code_point); + auto icu_property = static_cast(property.value()); + + if (property == PROPERTY_ANY) + return is_unicode(code_point); + if (property == PROPERTY_ASCII) + return is_ascii(code_point); + if (property == PROPERTY_ASSIGNED) + return u_isdefined(icu_code_point); + + return static_cast(u_hasBinaryProperty(icu_code_point, icu_property)); +} + +bool code_point_has_emoji_property(u32 code_point) +{ + return code_point_has_property(code_point, UCHAR_EMOJI); +} + +bool code_point_has_emoji_modifier_base_property(u32 code_point) +{ + return code_point_has_property(code_point, UCHAR_EMOJI_MODIFIER_BASE); +} + +bool code_point_has_emoji_presentation_property(u32 code_point) +{ + return code_point_has_property(code_point, UCHAR_EMOJI_PRESENTATION); +} + +bool code_point_has_identifier_start_property(u32 code_point) +{ + return u_isIDStart(static_cast(code_point)); +} + +bool code_point_has_identifier_continue_property(u32 code_point) +{ + return u_isIDPart(static_cast(code_point)); +} + +bool code_point_has_regional_indicator_property(u32 code_point) +{ + return code_point_has_property(code_point, UCHAR_REGIONAL_INDICATOR); +} + +bool code_point_has_variation_selector_property(u32 code_point) +{ + return code_point_has_property(code_point, UCHAR_VARIATION_SELECTOR); +} + +// https://tc39.es/ecma262/#table-binary-unicode-properties +bool is_ecma262_property(Property property) +{ + if (property == PROPERTY_ANY || property == PROPERTY_ASCII || property == PROPERTY_ASSIGNED) + return true; + + switch (property.value()) { + case UCHAR_ASCII_HEX_DIGIT: + case UCHAR_ALPHABETIC: + case UCHAR_BIDI_CONTROL: + case UCHAR_BIDI_MIRRORED: + case UCHAR_CASE_IGNORABLE: + case UCHAR_CASED: + case UCHAR_CHANGES_WHEN_CASEFOLDED: + case UCHAR_CHANGES_WHEN_CASEMAPPED: + case UCHAR_CHANGES_WHEN_LOWERCASED: + case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED: + case UCHAR_CHANGES_WHEN_TITLECASED: + case UCHAR_CHANGES_WHEN_UPPERCASED: + case UCHAR_DASH: + case UCHAR_DEFAULT_IGNORABLE_CODE_POINT: + case UCHAR_DEPRECATED: + case UCHAR_DIACRITIC: + case UCHAR_EMOJI: + case UCHAR_EMOJI_COMPONENT: + case UCHAR_EMOJI_MODIFIER: + case UCHAR_EMOJI_MODIFIER_BASE: + case UCHAR_EMOJI_PRESENTATION: + case UCHAR_EXTENDED_PICTOGRAPHIC: + case UCHAR_EXTENDER: + case UCHAR_GRAPHEME_BASE: + case UCHAR_GRAPHEME_EXTEND: + case UCHAR_HEX_DIGIT: + case UCHAR_IDS_BINARY_OPERATOR: + case UCHAR_IDS_TRINARY_OPERATOR: + case UCHAR_ID_CONTINUE: + case UCHAR_ID_START: + case UCHAR_IDEOGRAPHIC: + case UCHAR_JOIN_CONTROL: + case UCHAR_LOGICAL_ORDER_EXCEPTION: + case UCHAR_LOWERCASE: + case UCHAR_MATH: + case UCHAR_NONCHARACTER_CODE_POINT: + case UCHAR_PATTERN_SYNTAX: + case UCHAR_PATTERN_WHITE_SPACE: + case UCHAR_QUOTATION_MARK: + case UCHAR_RADICAL: + case UCHAR_REGIONAL_INDICATOR: + case UCHAR_S_TERM: + case UCHAR_SOFT_DOTTED: + case UCHAR_TERMINAL_PUNCTUATION: + case UCHAR_UNIFIED_IDEOGRAPH: + case UCHAR_UPPERCASE: + case UCHAR_VARIATION_SELECTOR: + case UCHAR_WHITE_SPACE: + case UCHAR_XID_CONTINUE: + case UCHAR_XID_START: return true; default: return false; } -#else - return false; -#endif } Optional