// NOTE(review): This file is a unified diff (three `diff --git` sections) whose hunks
// have been collapsed onto a few very long lines; everything below this comment is kept
// byte-for-byte. Template arguments appear to have been stripped in places (for example
// `Vector decomposition_mappings;`, `HashMap> composition_mappings;` and
// `Optional code_point_composition(...)`), so restore them from the upstream change
// before trying to apply or compile anything from here.
//
// What the visible hunks do:
//
//  * GenerateUnicodeData.cpp
//    - Adds `CodePointComposition { second_code_point, combined_code_point }` and a
//      `composition_mappings` member on `UnicodeData`.
//    - Adds `add_composition_mapping()`. It returns early unless the decomposition has
//      exactly two code points and its tag equals "Canonical", and it also returns early
//      when the code point matches one of the ranges passed in as
//      `full_composition_exclusion_code_points`. Otherwise it appends
//      (second code point, combined code point) under the first decomposed code point.
//      The exclusion loop stops at the first range for which the comparator result is
//      negative -- presumably the ranges are sorted ascending; confirm.
//    - `parse_unicode_data()` calls `add_composition_mapping()` for every entry that has
//      a decomposition mapping.
//    - Adds `CodePointCompositionRaw { code_point, second_code_point, combined_code_point }`.
//    - Emits an `s_composition_mappings` table. The first code points are run through
//      `quick_sort` before emission, so all entries sharing a first code point are
//      written next to each other; a line break is emitted after every 40 entries.
//    - Replaces the generated `code_point_decomposition_by_index(index)` with
//      `code_point_composition(first_code_point, second_code_point)`: a binary search on
//      the first code point, a rewind to the start of the run of entries with that same
//      `code_point`, then a forward scan for a matching `second_code_point`. It returns
//      the combined code point, or an empty value when nothing matches. The rewind and
//      scan rely on the adjacency produced by the sorted emission loop above.
//
//  * Normalize.cpp / Normalize.h
//    - The weak fallback definition and the header declaration are renamed to match.
//    - `combine_code_points()` now calls `code_point_composition(a, b)` instead of
//      walking every decomposition mapping by index; it still returns 0 when no
//      composition is found.
//
// NOTE(review): `unicode_data.prop_list.find("Full_Composition_Exclusion"sv)->value` is
// dereferenced without checking the result of `find()`. Presumably the property list is
// populated before `parse_unicode_data()` runs -- confirm against the caller.
diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 5e49b1d4f70..7ed16ba1369 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -130,11 +130,17 @@ struct CodePointBidiClass { ByteString bidi_class; }; +struct CodePointComposition { + u32 second_code_point { 0 }; + u32 combined_code_point { 0 }; +}; + struct UnicodeData { UniqueStringStorage unique_strings; u32 code_points_with_decomposition_mapping { 0 }; Vector decomposition_mappings; + HashMap> composition_mappings; Vector compatibility_tags; Vector special_casing; @@ -635,6 +641,25 @@ static Optional parse_decomposition_mapping(StringView s return mapping; } +static void add_composition_mapping(u32 code_point, CodePointDecomposition& decomposition, UnicodeData& unicode_data, Vector const& full_composition_exclusion_code_points) +{ + if (decomposition.decomposition_size != 2) + return; + if (decomposition.tag != "Canonical"sv) + return; + static Unicode::CodePointRangeComparator comparator {}; + for (auto const& range : full_composition_exclusion_code_points) { + auto comparison = comparator(code_point, range); + if (comparison == 0) + return; + if (comparison < 0) + break; + } + u32 const first_code_point = unicode_data.decomposition_mappings[decomposition.decomposition_index]; + u32 const second_code_point = unicode_data.decomposition_mappings[decomposition.decomposition_index + 1]; + unicode_data.composition_mappings.ensure(first_code_point).append(CodePointComposition { .second_code_point = second_code_point, .combined_code_point = code_point }); +} + static ErrorOr parse_block_display_names(Core::InputBufferedFile& file, UnicodeData& unicode_data) { Array buffer; @@ -663,6 +688,7 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa Optional code_point_range_start; auto& 
assigned_code_points = unicode_data.prop_list.find("Assigned"sv)->value; + auto const& full_composition_exclusion_code_points = unicode_data.prop_list.find("Full_Composition_Exclusion"sv)->value; Optional assigned_code_point_range_start = 0; u32 previous_code_point = 0; @@ -741,6 +767,8 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa } unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value(); + if (data.decomposition_mapping.has_value()) + add_composition_mapping(data.code_point, *data.decomposition_mapping, unicode_data, full_composition_exclusion_code_points); unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep); @@ -854,6 +882,12 @@ struct CodePointDecomposition { ReadonlySpan decomposition; }; +struct CodePointCompositionRaw { + u32 code_point { 0 }; + u32 second_code_point { 0 }; + u32 combined_code_point { 0 }; +}; + Optional locale_from_string(StringView locale); ReadonlySpan special_case_mapping(u32 code_point); @@ -1075,6 +1109,37 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; }); append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping, [](auto const& data) { return data.decomposition_mapping; }); + size_t composition_mappings_size = 0; + for (auto const& entry : unicode_data.composition_mappings) + composition_mappings_size += entry.value.size(); + generator.set("composition_mappings_size", ByteString::number(composition_mappings_size)); + generator.append(R"~~~( +static constexpr Array s_composition_mappings { { + )~~~"); + constexpr size_t max_mappings_per_row = 40; + size_t mappings_in_current_row = 0; + auto first_code_points = unicode_data.composition_mappings.keys(); + 
quick_sort(first_code_points); + for (auto const first_code_point : first_code_points) { + for (auto const& mapping : unicode_data.composition_mappings.find(first_code_point)->value) { + if (mappings_in_current_row++ > 0) + generator.append(" "); + + generator.set("code_point", ByteString::formatted("{:#x}", first_code_point)); + generator.set("second_code_point", ByteString::formatted("{:#x}", mapping.second_code_point)); + generator.set("combined_code_point", ByteString::formatted("{:#x}", mapping.combined_code_point)); + generator.append("{ @code_point@, @second_code_point@, @combined_code_point@ },"); + + if (mappings_in_current_row == max_mappings_per_row) { + mappings_in_current_row = 0; + generator.append("\n "); + } + } + } + generator.append(R"~~~( +} }; +)~~~"); + auto append_casing_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr { generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake))); generator.set("size", TRY(String::number(unique_properties.size()))); @@ -1365,12 +1430,18 @@ Optional code_point_decomposition(u32 code_point) return CodePointDecomposition { mapping->code_point, mapping->tag, ReadonlySpan { s_decomposition_mappings_data.data() + mapping->decomposition_index, mapping->decomposition_count } }; } -Optional code_point_decomposition_by_index(size_t index) +Optional code_point_composition(u32 first_code_point, u32 second_code_point) { - if (index >= s_decomposition_mappings.size()) + size_t mapping_index; + if (!binary_search(s_composition_mappings, first_code_point, &mapping_index, CodePointComparator {})) return {}; - auto const& mapping = s_decomposition_mappings[index]; - return CodePointDecomposition { mapping.code_point, mapping.tag, ReadonlySpan { s_decomposition_mappings_data.data() + mapping.decomposition_index, mapping.decomposition_count } }; + while (mapping_index > 0 && s_composition_mappings[mapping_index - 1].code_point == first_code_point) + mapping_index--; + for (; 
mapping_index < s_composition_mappings.size() && s_composition_mappings[mapping_index].code_point == first_code_point; ++mapping_index) { + if (s_composition_mappings[mapping_index].second_code_point == second_code_point) + return s_composition_mappings[mapping_index].combined_code_point; + } + return {}; } Optional bidirectional_class(u32 code_point) diff --git a/Userland/Libraries/LibUnicode/Normalize.cpp b/Userland/Libraries/LibUnicode/Normalize.cpp index c68dcc2b465..fbc2779ab90 100644 --- a/Userland/Libraries/LibUnicode/Normalize.cpp +++ b/Userland/Libraries/LibUnicode/Normalize.cpp @@ -20,7 +20,7 @@ struct Unicode::CodePointDecomposition { }; namespace Unicode { Optional __attribute__((weak)) code_point_decomposition(u32) { return {}; } -Optional __attribute__((weak)) code_point_decomposition_by_index(size_t) { return {}; } +Optional __attribute__((weak)) code_point_composition(u32, u32) { return {}; } NormalizationForm normalization_form_from_string(StringView form) { @@ -126,20 +126,9 @@ static u32 combine_hangul_code_points(u32 a, u32 b) static u32 combine_code_points([[maybe_unused]] u32 a, [[maybe_unused]] u32 b) { #if ENABLE_UNICODE_DATA - Array const points { a, b }; - - // FIXME: Do something better than linear search to find reverse mappings. 
- for (size_t index = 0;; ++index) { - auto mapping_maybe = Unicode::code_point_decomposition_by_index(index); - if (!mapping_maybe.has_value()) - break; - auto& mapping = mapping_maybe.value(); - if (mapping.tag == CompatibilityFormattingTag::Canonical && mapping.decomposition == points) { - if (code_point_has_property(mapping.code_point, Property::Full_Composition_Exclusion)) - continue; - return mapping.code_point; - } - } + auto composition = code_point_composition(a, b); + if (composition.has_value()) + return composition.value(); #endif return 0; diff --git a/Userland/Libraries/LibUnicode/Normalize.h b/Userland/Libraries/LibUnicode/Normalize.h index 17b34550962..73dbc21373e 100644 --- a/Userland/Libraries/LibUnicode/Normalize.h +++ b/Userland/Libraries/LibUnicode/Normalize.h @@ -17,7 +17,7 @@ namespace Unicode { Optional code_point_decomposition(u32 code_point); -Optional code_point_decomposition_by_index(size_t index); +Optional code_point_composition(u32 first_code_point, u32 second_code_point); enum class NormalizationForm { NFD,