Meta: Do not hard-code index types for UCD/CLDR/TZDB code generators

Hand-picking the smallest index type that fits a particular generated
array started with commit 3ad159537e. This
was to reduce the size of the generated library.

Since then, the number of types using UniqueStorage has grown a ton,
creating a long list of types for which index types are manually picked.
When a new UCD/CLDR/TZDB version is released and the current index type
no longer fits the generated data, code generation fails. Tracking down
which index caused the failure is a pretty annoying process.

Instead, we can just use size_t within the generators themselves, then
automatically pick the index type needed for the generated code.
This commit is contained in:
Timothy Flynn 2022-11-18 11:04:33 -05:00 committed by Linus Groh
parent fa2579ffa9
commit b2164ad979
Notes: sideshowbarker 2024-07-17 07:31:31 +09:00
9 changed files with 268 additions and 379 deletions

View file

@ -20,9 +20,6 @@
#include <LibCore/ArgsParser.h>
#include <LibCore/Stream.h>
using StringIndexType = u16;
constexpr auto s_string_index_type = "u16"sv;
// Some code points are excluded from UnicodeData.txt, and instead are part of a "range" of code
// points, as indicated by the "name" field. For example:
// 3400;<CJK Ideograph Extension A, First>;Lo;0;L;;;;;N;;;;;
@ -74,7 +71,7 @@ using NormalizationProps = HashMap<String, Vector<Normalization>>;
struct CodePointName {
CodePointRange code_point_range;
StringIndexType name { 0 };
size_t name { 0 };
};
// UnicodeData source: https://www.unicode.org/Public/13.0.0/ucd/UnicodeData.txt
@ -83,7 +80,7 @@ struct CodePointName {
struct CodePointData {
u32 code_point { 0 };
String name;
Optional<StringIndexType> abbreviation;
Optional<size_t> abbreviation;
u8 canonical_combining_class { 0 };
String bidi_class;
Optional<CodePointDecomposition> decomposition_mapping;
@ -101,11 +98,11 @@ struct CodePointData {
struct BlockName {
CodePointRange code_point_range;
StringIndexType name { 0 };
size_t name { 0 };
};
struct UnicodeData {
UniqueStringStorage<StringIndexType> unique_strings;
UniqueStringStorage unique_strings;
u32 code_points_with_non_zero_combining_class { 0 };
@ -125,8 +122,8 @@ struct UnicodeData {
Vector<CodePointData> code_point_data;
HashMap<u32, StringIndexType> code_point_abbreviations;
HashMap<u32, StringIndexType> code_point_display_name_aliases;
HashMap<u32, size_t> code_point_abbreviations;
HashMap<u32, size_t> code_point_display_name_aliases;
Vector<CodePointName> code_point_display_names;
PropList general_categories;
@ -795,7 +792,7 @@ static ErrorOr<void> generate_unicode_data_implementation(Core::Stream::Buffered
StringBuilder builder;
SourceGenerator generator { builder };
generator.set("string_index_type"sv, s_string_index_type);
generator.set("string_index_type"sv, unicode_data.unique_strings.type_that_fits());
generator.set("largest_special_casing_size", String::number(unicode_data.largest_special_casing_size));
generator.set("special_casing_size", String::number(unicode_data.special_casing.size()));
@ -947,7 +944,7 @@ static constexpr Array<@mapping_type@, @size@> s_@name@_mappings { {
generator.set("code_point", String::formatted("{:#x}", data.code_point));
generator.append("{ @code_point@");
if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), Optional<StringIndexType>>) {
if constexpr (IsSame<decltype(mapping), Optional<u32>> || IsSame<decltype(mapping), Optional<size_t>>) {
generator.set("mapping", String::formatted("{:#x}", *mapping));
generator.append(", @mapping@ },");
} else if constexpr (IsSame<decltype(mapping), Optional<CodePointDecomposition>>) {