LibUnicode: Parse compact identifiers and replace them with a format key

For example, in en-US, the decimal, long compact pattern for numbers
between 10,000 and 100,000 is "00 thousand". In that pattern, "thousand"
is the compact identifier, and the generated format pattern is now
"{number} {compactIdentifier}". This also generates that identifier as
its own field in the NumberFormat structure.
This commit is contained in:
Timothy Flynn 2021-11-15 07:42:39 -05:00 committed by Linus Groh
parent 1533123263
commit 48d5684780
Notes: sideshowbarker 2024-07-18 01:05:15 +09:00
2 changed files with 54 additions and 8 deletions

View file

@ -17,6 +17,7 @@
#include <AK/SourceGenerator.h>
#include <AK/String.h>
#include <AK/StringBuilder.h>
#include <AK/Utf8View.h>
#include <LibCore/ArgsParser.h>
#include <LibCore/DirIterator.h>
#include <LibCore/File.h>
@ -26,6 +27,11 @@
using StringIndexType = u16;
constexpr auto s_string_index_type = "u16"sv;
enum class NumberFormatType {
Standard,
Compact,
};
struct NumberFormat : public Unicode::NumberFormat {
using Base = Unicode::NumberFormat;
@ -51,6 +57,7 @@ struct NumberFormat : public Unicode::NumberFormat {
StringIndexType zero_format_index { 0 };
StringIndexType positive_format_index { 0 };
StringIndexType negative_format_index { 0 };
StringIndexType compact_identifier_index { 0 };
};
struct NumberSystem {
@ -83,7 +90,7 @@ struct UnicodeLocaleData {
Vector<String> numeric_symbols;
};
static void parse_number_pattern(String pattern, UnicodeLocaleData& locale_data, NumberFormat& format, NumberSystem* number_system_for_groupings = nullptr)
static void parse_number_pattern(String pattern, UnicodeLocaleData& locale_data, NumberFormatType type, NumberFormat& format, NumberSystem* number_system_for_groupings = nullptr)
{
// https://unicode.org/reports/tr35/tr35-numbers.html#Number_Format_Patterns
// https://cldr.unicode.org/translation/number-currency-formats/number-and-currency-patterns
@ -138,6 +145,41 @@ static void parse_number_pattern(String pattern, UnicodeLocaleData& locale_data,
pattern = pattern.replace("0"sv, "{scientificExponent}"sv);
}
if (type == NumberFormatType::Compact) {
static Utf8View whitespace { "\u0020\u00a0"sv };
Utf8View utf8_pattern { pattern };
Optional<size_t> start_compact_index;
Optional<size_t> end_compact_index;
bool inside_replacement = false;
for (auto it = utf8_pattern.begin(); it != utf8_pattern.end(); ++it) {
if (*it == '{') {
if (start_compact_index.has_value()) {
end_compact_index = utf8_pattern.byte_offset_of(it);
break;
}
inside_replacement = true;
} else if (*it == '}') {
inside_replacement = false;
} else if (!inside_replacement && !start_compact_index.has_value() && !whitespace.contains(*it)) {
start_compact_index = utf8_pattern.byte_offset_of(it);
}
}
if (!start_compact_index.has_value())
return pattern;
utf8_pattern = utf8_pattern.substring_view(*start_compact_index, end_compact_index.value_or(pattern.length()) - *start_compact_index);
utf8_pattern = utf8_pattern.trim(whitespace);
auto identifier = utf8_pattern.as_string().replace("'.'"sv, "."sv);
format.compact_identifier_index = locale_data.unique_strings.ensure(move(identifier));
pattern = pattern.replace(utf8_pattern.as_string(), "{compactIdentifier}");
}
return pattern;
};
@ -206,7 +248,7 @@ static void parse_number_systems(String locale_numbers_path, UnicodeLocaleData&
}
format.plurality = NumberFormat::plurality_from_string(split_key[2]);
parse_number_pattern(value.as_string(), locale_data, format);
parse_number_pattern(value.as_string(), locale_data, NumberFormatType::Compact, format);
result.append(move(format));
});
@ -237,7 +279,7 @@ static void parse_number_systems(String locale_numbers_path, UnicodeLocaleData&
auto& number_system = ensure_number_system(system);
auto format_object = value.as_object().get("standard"sv);
parse_number_pattern(format_object.as_string(), locale_data, number_system.decimal_format, &number_system);
parse_number_pattern(format_object.as_string(), locale_data, NumberFormatType::Standard, number_system.decimal_format, &number_system);
auto const& long_format = value.as_object().get("long"sv).as_object().get("decimalFormat"sv);
number_system.decimal_long_formats = parse_number_format(long_format.as_object());
@ -249,10 +291,10 @@ static void parse_number_systems(String locale_numbers_path, UnicodeLocaleData&
auto& number_system = ensure_number_system(system);
auto format_object = value.as_object().get("standard"sv);
parse_number_pattern(format_object.as_string(), locale_data, number_system.currency_format);
parse_number_pattern(format_object.as_string(), locale_data, NumberFormatType::Standard, number_system.currency_format);
format_object = value.as_object().get("accounting"sv);
parse_number_pattern(format_object.as_string(), locale_data, number_system.accounting_format);
parse_number_pattern(format_object.as_string(), locale_data, NumberFormatType::Standard, number_system.accounting_format);
number_system.currency_unit_formats = parse_number_format(value.as_object());
@ -265,13 +307,13 @@ static void parse_number_systems(String locale_numbers_path, UnicodeLocaleData&
auto& number_system = ensure_number_system(system);
auto format_object = value.as_object().get("standard"sv);
parse_number_pattern(format_object.as_string(), locale_data, number_system.percent_format);
parse_number_pattern(format_object.as_string(), locale_data, NumberFormatType::Standard, number_system.percent_format);
} else if (key.starts_with(scientific_formats_prefix)) {
auto system = key.substring(scientific_formats_prefix.length());
auto& number_system = ensure_number_system(system);
auto format_object = value.as_object().get("standard"sv);
parse_number_pattern(format_object.as_string(), locale_data, number_system.scientific_format);
parse_number_pattern(format_object.as_string(), locale_data, NumberFormatType::Standard, number_system.scientific_format);
}
});
}
@ -387,6 +429,7 @@ struct NumberFormat {
number_format.zero_format = s_string_list[zero_format];
number_format.positive_format = s_string_list[positive_format];
number_format.negative_format = s_string_list[negative_format];
number_format.compact_identifier = s_string_list[compact_identifier];
return number_format;
}
@ -397,6 +440,7 @@ struct NumberFormat {
@string_index_type@ zero_format { 0 };
@string_index_type@ positive_format { 0 };
@string_index_type@ negative_format { 0 };
@string_index_type@ compact_identifier { 0 };
};
struct NumberSystem {
@ -427,7 +471,8 @@ struct NumberSystem {
generator.set("zero_format"sv, String::number(number_format.zero_format_index));
generator.set("positive_format"sv, String::number(number_format.positive_format_index));
generator.set("negative_format"sv, String::number(number_format.negative_format_index));
generator.append("{ @magnitude@, @compact_scale@, @plurality@, @zero_format@, @positive_format@, @negative_format@ },");
generator.set("compact_identifier"sv, String::number(number_format.compact_identifier_index));
generator.append("{ @magnitude@, @compact_scale@, @plurality@, @zero_format@, @positive_format@, @negative_format@, @compact_identifier@ },");
};
auto append_number_formats = [&](String name, auto const& number_formats) {

View file

@ -122,6 +122,7 @@ struct NumberFormat {
StringView zero_format {};
StringView positive_format {};
StringView negative_format {};
StringView compact_identifier {};
};
struct ListPatterns {