From 945c58c7c1fe4612b86171e9c7eea4a7f78c6574 Mon Sep 17 00:00:00 2001 From: Idan Horowitz Date: Sat, 6 Apr 2024 18:45:52 +0300 Subject: [PATCH] LibUnicode: Generate and use code point composition mappings These allow us to binary search the code point compositions based on the first code point being combined, which makes the search close to O(log N) instead of O(N). --- .../LibUnicode/GenerateUnicodeData.cpp | 79 ++++++++++++++++++- Userland/Libraries/LibUnicode/Normalize.cpp | 19 +---- Userland/Libraries/LibUnicode/Normalize.h | 2 +- 3 files changed, 80 insertions(+), 20 deletions(-) diff --git a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp index 5e49b1d4f70..7ed16ba1369 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibUnicode/GenerateUnicodeData.cpp @@ -130,11 +130,17 @@ struct CodePointBidiClass { ByteString bidi_class; }; +struct CodePointComposition { + u32 second_code_point { 0 }; + u32 combined_code_point { 0 }; +}; + struct UnicodeData { UniqueStringStorage unique_strings; u32 code_points_with_decomposition_mapping { 0 }; Vector decomposition_mappings; + HashMap> composition_mappings; Vector compatibility_tags; Vector special_casing; @@ -635,6 +641,25 @@ static Optional parse_decomposition_mapping(StringView s return mapping; } +static void add_composition_mapping(u32 code_point, CodePointDecomposition& decomposition, UnicodeData& unicode_data, Vector const& full_composition_exclusion_code_points) +{ + if (decomposition.decomposition_size != 2) + return; + if (decomposition.tag != "Canonical"sv) + return; + static Unicode::CodePointRangeComparator comparator {}; + for (auto const& range : full_composition_exclusion_code_points) { + auto comparison = comparator(code_point, range); + if (comparison == 0) + return; + if (comparison < 0) + break; + } + u32 const first_code_point = unicode_data.decomposition_mappings[decomposition.decomposition_index]; + u32 const second_code_point = unicode_data.decomposition_mappings[decomposition.decomposition_index + 1]; + unicode_data.composition_mappings.ensure(first_code_point).append(CodePointComposition { .second_code_point = second_code_point, .combined_code_point = code_point }); +} + static ErrorOr parse_block_display_names(Core::InputBufferedFile& file, UnicodeData& unicode_data) { Array buffer; @@ -663,6 +688,7 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa Optional code_point_range_start; auto& assigned_code_points = unicode_data.prop_list.find("Assigned"sv)->value; + auto const& full_composition_exclusion_code_points = unicode_data.prop_list.find("Full_Composition_Exclusion"sv)->value; Optional assigned_code_point_range_start = 0; u32 previous_code_point = 0; @@ -741,6 +767,8 @@ static ErrorOr parse_unicode_data(Core::InputBufferedFile& file, UnicodeDa } unicode_data.code_points_with_decomposition_mapping += data.decomposition_mapping.has_value(); + if (data.decomposition_mapping.has_value()) + add_composition_mapping(data.code_point, *data.decomposition_mapping, unicode_data, full_composition_exclusion_code_points); unicode_data.bidirectional_classes.set(data.bidi_class, AK::HashSetExistingEntryBehavior::Keep); @@ -854,6 +882,12 @@ struct CodePointDecomposition { ReadonlySpan decomposition; }; +struct CodePointCompositionRaw { + u32 code_point { 0 }; + u32 second_code_point { 0 }; + u32 combined_code_point { 0 }; +}; + Optional locale_from_string(StringView locale); ReadonlySpan special_case_mapping(u32 code_point); @@ -1075,6 +1109,37 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { { append_code_point_mappings("abbreviation"sv, "CodePointAbbreviation"sv, unicode_data.code_point_abbreviations.size(), [](auto const& data) { return data.abbreviation; }); append_code_point_mappings("decomposition"sv, "CodePointDecompositionRaw"sv, unicode_data.code_points_with_decomposition_mapping, [](auto const& data) { return data.decomposition_mapping; }); + size_t composition_mappings_size = 0; + for (auto const& entry : unicode_data.composition_mappings) + composition_mappings_size += entry.value.size(); + generator.set("composition_mappings_size", ByteString::number(composition_mappings_size)); + generator.append(R"~~~( +static constexpr Array s_composition_mappings { { + )~~~"); + constexpr size_t max_mappings_per_row = 40; + size_t mappings_in_current_row = 0; + auto first_code_points = unicode_data.composition_mappings.keys(); + quick_sort(first_code_points); + for (auto const first_code_point : first_code_points) { + for (auto const& mapping : unicode_data.composition_mappings.find(first_code_point)->value) { + if (mappings_in_current_row++ > 0) + generator.append(" "); + + generator.set("code_point", ByteString::formatted("{:#x}", first_code_point)); + generator.set("second_code_point", ByteString::formatted("{:#x}", mapping.second_code_point)); + generator.set("combined_code_point", ByteString::formatted("{:#x}", mapping.combined_code_point)); + generator.append("{ @code_point@, @second_code_point@, @combined_code_point@ },"); + + if (mappings_in_current_row == max_mappings_per_row) { + mappings_in_current_row = 0; + generator.append("\n "); + } + } + } + generator.append(R"~~~( +} }; +)~~~"); + auto append_casing_table = [&](auto collection_snake, auto const& unique_properties) -> ErrorOr { generator.set("name", TRY(String::formatted("{}_unique_properties", collection_snake))); generator.set("size", TRY(String::number(unique_properties.size()))); @@ -1365,12 +1430,18 @@ Optional code_point_decomposition(u32 code_point) return CodePointDecomposition { mapping->code_point, mapping->tag, ReadonlySpan { s_decomposition_mappings_data.data() + mapping->decomposition_index, mapping->decomposition_count } }; } -Optional code_point_decomposition_by_index(size_t index) +Optional code_point_composition(u32 first_code_point, u32 second_code_point) { - if (index >= s_decomposition_mappings.size()) + size_t mapping_index; + if (!binary_search(s_composition_mappings, first_code_point, &mapping_index, CodePointComparator {})) return {}; - auto const& mapping = s_decomposition_mappings[index]; - return CodePointDecomposition { mapping.code_point, mapping.tag, ReadonlySpan { s_decomposition_mappings_data.data() + mapping.decomposition_index, mapping.decomposition_count } }; + while (mapping_index > 0 && s_composition_mappings[mapping_index - 1].code_point == first_code_point) + mapping_index--; + for (; mapping_index < s_composition_mappings.size() && s_composition_mappings[mapping_index].code_point == first_code_point; ++mapping_index) { + if (s_composition_mappings[mapping_index].second_code_point == second_code_point) + return s_composition_mappings[mapping_index].combined_code_point; + } + return {}; } Optional bidirectional_class(u32 code_point) diff --git a/Userland/Libraries/LibUnicode/Normalize.cpp b/Userland/Libraries/LibUnicode/Normalize.cpp index c68dcc2b465..fbc2779ab90 100644 --- a/Userland/Libraries/LibUnicode/Normalize.cpp +++ b/Userland/Libraries/LibUnicode/Normalize.cpp @@ -20,7 +20,7 @@ struct Unicode::CodePointDecomposition { }; namespace Unicode { Optional __attribute__((weak)) code_point_decomposition(u32) { return {}; } -Optional __attribute__((weak)) code_point_decomposition_by_index(size_t) { return {}; } +Optional __attribute__((weak)) code_point_composition(u32, u32) { return {}; } NormalizationForm normalization_form_from_string(StringView form) { @@ -126,20 +126,9 @@ static u32 combine_hangul_code_points(u32 a, u32 b) static u32 combine_code_points([[maybe_unused]] u32 a, [[maybe_unused]] u32 b) { #if ENABLE_UNICODE_DATA - Array const points { a, b }; - - // FIXME: Do something better than linear search to find reverse mappings. - for (size_t index = 0;; ++index) { - auto mapping_maybe = Unicode::code_point_decomposition_by_index(index); - if (!mapping_maybe.has_value()) - break; - auto& mapping = mapping_maybe.value(); - if (mapping.tag == CompatibilityFormattingTag::Canonical && mapping.decomposition == points) { - if (code_point_has_property(mapping.code_point, Property::Full_Composition_Exclusion)) - continue; - return mapping.code_point; - } - } + auto composition = code_point_composition(a, b); + if (composition.has_value()) + return composition.value(); #endif return 0; diff --git a/Userland/Libraries/LibUnicode/Normalize.h b/Userland/Libraries/LibUnicode/Normalize.h index 17b34550962..73dbc21373e 100644 --- a/Userland/Libraries/LibUnicode/Normalize.h +++ b/Userland/Libraries/LibUnicode/Normalize.h @@ -17,7 +17,7 @@ namespace Unicode { Optional code_point_decomposition(u32 code_point); -Optional code_point_decomposition_by_index(size_t index); +Optional code_point_composition(u32 first_code_point, u32 second_code_point); enum class NormalizationForm { NFD,