From 9724a25daf4e1548880ca0d2f42b54258782c926 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Sat, 8 Jun 2024 11:22:05 -0400 Subject: [PATCH] LibJS+LibLocale: Replace canonical locales and display names with ICU Note: We keep locale parsing and syntactic validation as-is. ECMA-402 places additional restrictions on locales above what is required by the Unicode spec. ICU doesn't provide methods that let us easily check those restrictions, whereas LibLocale does. Other browsers also implement their own validators here. This introduces a locale cache to re-use parsed locale data and various related structures (not doing so has a non-negligible performance impact on Intl tests). The existing APIs for canonicalization and display names are pretty intertwined, so they must both be adapted at once here. The results of canonicalization are slightly different on some edge cases. But the changed results are actually now aligned with Chrome and Safari. --- Meta/CMake/locale_data.cmake | 2 +- .../LibLocale/GenerateLocaleData.cpp | 748 +----------------- Tests/LibLocale/CMakeLists.txt | 1 + Tests/LibLocale/TestDisplayNames.cpp | 82 ++ Tests/LibLocale/TestLocale.cpp | 85 +- .../LibJS/Runtime/Intl/AbstractOperations.cpp | 123 ++- .../LibJS/Runtime/Intl/AbstractOperations.h | 6 +- .../LibJS/Runtime/Intl/DisplayNames.cpp | 31 +- .../LibJS/Runtime/Intl/DisplayNames.h | 24 +- .../Runtime/Intl/DisplayNamesPrototype.cpp | 52 +- .../Libraries/LibJS/Runtime/Intl/Intl.cpp | 15 +- .../LibJS/Runtime/Intl/LocaleConstructor.cpp | 13 +- .../LibJS/Runtime/Intl/NumberFormat.cpp | 9 +- .../LibJS/Runtime/Intl/NumberFormat.h | 4 +- .../builtins/Intl/Intl.getCanonicalLocales.js | 4 +- Userland/Libraries/LibLocale/CMakeLists.txt | 2 + Userland/Libraries/LibLocale/DisplayNames.cpp | 242 ++++++ Userland/Libraries/LibLocale/DisplayNames.h | 32 + Userland/Libraries/LibLocale/Forward.h | 8 +- Userland/Libraries/LibLocale/ICU.cpp | 104 +++ Userland/Libraries/LibLocale/ICU.h | 67 ++ Userland/Libraries/LibLocale/Locale.cpp | 363 ++------- Userland/Libraries/LibLocale/Locale.h | 37 +- 23 files changed, 693 insertions(+), 1361 deletions(-) create mode 100644 Tests/LibLocale/TestDisplayNames.cpp create mode 100644 Userland/Libraries/LibLocale/DisplayNames.cpp create mode 100644 Userland/Libraries/LibLocale/DisplayNames.h create mode 100644 Userland/Libraries/LibLocale/ICU.cpp create mode 100644 Userland/Libraries/LibLocale/ICU.h diff --git a/Meta/CMake/locale_data.cmake b/Meta/CMake/locale_data.cmake index 618f8ebed5c..9ef4b83f35d 100644 --- a/Meta/CMake/locale_data.cmake +++ b/Meta/CMake/locale_data.cmake @@ -75,7 +75,7 @@ if (ENABLE_UNICODE_DATABASE_DOWNLOAD) "${CLDR_VERSION_FILE}" "${LOCALE_DATA_HEADER}" "${LOCALE_DATA_IMPLEMENTATION}" - arguments -b "${CLDR_BCP47_PATH}" -r "${CLDR_CORE_PATH}" -l "${CLDR_LOCALES_PATH}" -m "${CLDR_MISC_PATH}" -n "${CLDR_NUMBERS_PATH}" -d "${CLDR_DATES_PATH}" + arguments -b "${CLDR_BCP47_PATH}" -r "${CLDR_CORE_PATH}" -m "${CLDR_MISC_PATH}" -n "${CLDR_NUMBERS_PATH}" -d "${CLDR_DATES_PATH}" ) invoke_generator( "NumberFormatData" diff --git a/Meta/Lagom/Tools/CodeGenerators/LibLocale/GenerateLocaleData.cpp b/Meta/Lagom/Tools/CodeGenerators/LibLocale/GenerateLocaleData.cpp index 426816e8440..65e220a3c3e 100644 --- a/Meta/Lagom/Tools/CodeGenerators/LibLocale/GenerateLocaleData.cpp +++ b/Meta/Lagom/Tools/CodeGenerators/LibLocale/GenerateLocaleData.cpp @@ -33,38 +33,6 @@ static ByteString format_identifier(StringView owner, ByteString identifier) return identifier; } -struct DisplayPattern { - unsigned hash() const - { - return pair_int_hash(locale_pattern, locale_separator); - } - - bool operator==(DisplayPattern const& other) const - { - return (locale_pattern == other.locale_pattern) - && (locale_separator == other.locale_separator); - } - - size_t locale_pattern { 0 }; - size_t locale_separator { 0 }; -}; - -template<> -struct AK::Formatter : Formatter { - ErrorOr format(FormatBuilder& builder, DisplayPattern const& patterns) - { - return Formatter::format(builder, - "{{ {}, {} }}"sv, - patterns.locale_pattern, - patterns.locale_separator); - } -}; - -template<> -struct AK::Traits : public DefaultTraits { - static unsigned hash(DisplayPattern const& p) { return p.hash(); } -}; - struct ListPatterns { unsigned hash() const { @@ -143,31 +111,10 @@ struct AK::Traits : public DefaultTraits { static unsigned hash(TextLayout const& t) { return t.hash(); } }; -using LanguageList = Vector; -using TerritoryList = Vector; -using ScriptList = Vector; -using CurrencyList = Vector; -using CalendarList = Vector; -using DateFieldList = Vector; using KeywordList = Vector; using ListPatternList = Vector; struct LocaleData { - ByteString language; - Optional territory; - Optional variant; - size_t display_patterns { 0 }; - size_t languages { 0 }; - size_t territories { 0 }; - size_t scripts { 0 }; - size_t long_currencies { 0 }; - size_t short_currencies { 0 }; - size_t narrow_currencies { 0 }; - size_t numeric_currencies { 0 }; - size_t calendars { 0 }; - size_t long_date_fields { 0 }; - size_t short_date_fields { 0 }; - size_t narrow_date_fields { 0 }; size_t calendar_keywords { 0 }; size_t collation_case_keywords { 0 }; size_t collation_numeric_keywords { 0 }; @@ -183,13 +130,6 @@ struct LanguageMapping { struct CLDR { UniqueStringStorage unique_strings; - UniqueStorage unique_display_patterns; - UniqueStorage unique_language_lists; - UniqueStorage unique_territory_lists; - UniqueStorage unique_script_lists; - UniqueStorage unique_currency_lists; - UniqueStorage unique_calendar_lists; - UniqueStorage unique_date_field_lists; UniqueStorage unique_keyword_lists; UniqueStorage unique_list_patterns; UniqueStorage unique_list_pattern_lists; @@ -198,44 +138,12 @@ struct CLDR { HashMap locales; Vector locale_aliases; - Vector languages; - HashMap language_indices; - - Vector territories; - HashMap territory_indices; - - Vector scripts; - HashMap script_indices; - - Vector variants; - HashMap variant_indices; - - Vector currencies; - HashMap currency_indices; - - Vector date_fields; - HashMap date_fields_indices; - - Vector date_field_aliases { - // ECMA-402 and the CLDR refer to some date fields with different names. Defining these aliases - // means we can remain agnostic about the naming differences elsewhere. - { "dayperiod"sv, "dayPeriod"sv }, - { "week"sv, "weekOfYear"sv }, - { "zone"sv, "timeZoneName"sv }, - }; - HashMap> keywords; HashMap> keyword_aliases; HashMap keyword_names; Vector list_pattern_types; Vector character_orders; - HashMap language_aliases; - HashMap territory_aliases; - HashMap script_aliases; - HashMap variant_aliases; - HashMap subdivision_aliases; - Vector complex_mappings; Vector likely_subtags; size_t max_variant_size { 0 }; }; @@ -273,40 +181,6 @@ static ErrorOr parse_language_mapping(CLDR& cldr, StringView ke return LanguageMapping { move(parsed_key), move(parsed_alias) }; } -static ErrorOr parse_core_aliases(ByteString core_supplemental_path, CLDR& cldr) -{ - LexicalPath core_aliases_path(move(core_supplemental_path)); - core_aliases_path = core_aliases_path.append("aliases.json"sv); - - auto core_aliases = TRY(read_json_file(core_aliases_path.string())); - auto const& supplemental_object = core_aliases.as_object().get_object("supplemental"sv).value(); - auto const& metadata_object = supplemental_object.get_object("metadata"sv).value(); - auto const& alias_object = metadata_object.get_object("alias"sv).value(); - - auto append_aliases = [&](auto& alias_object, auto& alias_map) { - alias_object.for_each_member([&](auto const& key, JsonValue const& value) { - auto alias = value.as_object().get_byte_string("_replacement"sv).value(); - - if (key.contains('-')) { - auto mapping = TRY_OR_DISCARD(parse_language_mapping(cldr, key, alias)); - cldr.max_variant_size = max(mapping.key.variants.size(), cldr.max_variant_size); - cldr.max_variant_size = max(mapping.alias.variants.size(), cldr.max_variant_size); - cldr.complex_mappings.append(move(mapping)); - } else { - alias_map.set(key, cldr.unique_strings.ensure(alias)); - } - }); - }; - - append_aliases(alias_object.get_object("languageAlias"sv).value(), cldr.language_aliases); - append_aliases(alias_object.get_object("territoryAlias"sv).value(), cldr.territory_aliases); - append_aliases(alias_object.get_object("scriptAlias"sv).value(), cldr.script_aliases); - append_aliases(alias_object.get_object("variantAlias"sv).value(), cldr.variant_aliases); - append_aliases(alias_object.get_object("subdivisionAlias"sv).value(), cldr.subdivision_aliases); - - return {}; -} - static ErrorOr parse_likely_subtags(ByteString core_supplemental_path, CLDR& cldr) { LexicalPath likely_subtags_path(move(core_supplemental_path)); @@ -326,149 +200,6 @@ static ErrorOr parse_likely_subtags(ByteString core_supplemental_path, CLD return {}; } -static ErrorOr parse_identity(ByteString locale_path, CLDR& cldr, LocaleData& locale) -{ - LexicalPath locale_display_names_path(move(locale_path)); // Note: Every JSON file defines identity data, so we can use any of them. - locale_display_names_path = locale_display_names_path.append("localeDisplayNames.json"sv); - - auto const& locale_display_names = *TRY(read_json_file_with_cache(locale_display_names_path.string())); - auto const& main_object = locale_display_names.as_object().get_object("main"sv).value(); - auto const& locale_object = main_object.get_object(locale_display_names_path.parent().basename()).value(); - auto const& identity_object = locale_object.get_object("identity"sv).value(); - auto const& language_string = identity_object.get_byte_string("language"sv).value(); - auto const& territory_string = identity_object.get_byte_string("territory"sv); - auto const& script_string = identity_object.get_byte_string("script"sv); - auto const& variant_string = identity_object.get_byte_string("variant"sv); - - locale.language = language_string; - - if (territory_string.has_value()) { - locale.territory = territory_string.value(); - - if (!cldr.territory_indices.contains(*locale.territory)) { - cldr.territory_indices.set(*locale.territory, 0); - cldr.territories.append(*locale.territory); - } - } - - if (script_string.has_value()) { - auto const& script = script_string.value(); - - if (!cldr.script_indices.contains(script)) { - cldr.script_indices.set(script, 0); - cldr.scripts.append(script); - } - } - - if (variant_string.has_value()) { - locale.variant = variant_string.value(); - - if (!cldr.variant_indices.contains(*locale.variant)) { - cldr.variant_indices.set(*locale.variant, 0); - cldr.variants.append(*locale.variant); - } - } - - return {}; -} - -static ErrorOr parse_locale_display_patterns(ByteString locale_path, CLDR& cldr, LocaleData& locale) -{ - LexicalPath locale_display_names_path(move(locale_path)); - locale_display_names_path = locale_display_names_path.append("localeDisplayNames.json"sv); - - auto const& locale_display_names = *TRY(read_json_file_with_cache(locale_display_names_path.string())); - auto const& main_object = locale_display_names.as_object().get_object("main"sv).value(); - auto const& locale_object = main_object.get_object(locale_display_names_path.parent().basename()).value(); - auto const& locale_display_names_object = locale_object.get_object("localeDisplayNames"sv).value(); - auto const& locale_display_patterns_object = locale_display_names_object.get_object("localeDisplayPattern"sv).value(); - auto const& locale_pattern = locale_display_patterns_object.get_byte_string("localePattern"sv).value(); - auto const& locale_separator = locale_display_patterns_object.get_byte_string("localeSeparator"sv).value(); - - DisplayPattern patterns {}; - patterns.locale_pattern = cldr.unique_strings.ensure(locale_pattern); - patterns.locale_separator = cldr.unique_strings.ensure(locale_separator); - - locale.display_patterns = cldr.unique_display_patterns.ensure(move(patterns)); - return {}; -} - -static ErrorOr preprocess_languages(ByteString locale_path, CLDR& cldr) -{ - LexicalPath languages_path(move(locale_path)); - languages_path = languages_path.append("languages.json"sv); - - if (!FileSystem::exists(languages_path.string())) - return {}; - - auto const& locale_languages = *TRY(read_json_file_with_cache(languages_path.string())); - auto const& main_object = locale_languages.as_object().get_object("main"sv).value(); - auto const& locale_object = main_object.get_object(languages_path.parent().basename()).value(); - auto const& locale_display_names_object = locale_object.get_object("localeDisplayNames"sv).value(); - auto const& languages_object = locale_display_names_object.get_object("languages"sv).value(); - - languages_object.for_each_member([&](auto const& key, auto const&) { - if (!key.contains("-alt-"sv) && !cldr.language_indices.contains(key)) { - cldr.language_indices.set(key, 0); - cldr.languages.append(key); - } - }); - - return {}; -} - -static ErrorOr preprocess_currencies(ByteString numbers_path, CLDR& cldr) -{ - LexicalPath currencies_path(move(numbers_path)); - currencies_path = currencies_path.append("currencies.json"sv); - - auto const& locale_currencies = *TRY(read_json_file_with_cache(currencies_path.string())); - auto const& main_object = locale_currencies.as_object().get_object("main"sv).value(); - auto const& locale_object = main_object.get_object(currencies_path.parent().basename()).value(); - auto const& locale_numbers_object = locale_object.get_object("numbers"sv).value(); - auto const& currencies_object = locale_numbers_object.get_object("currencies"sv).value(); - - currencies_object.for_each_member([&](auto const& key, JsonValue const&) { - if (!cldr.currency_indices.contains(key)) { - cldr.currency_indices.set(key, 0); - cldr.currencies.append(key); - } - }); - - return {}; -} - -static bool is_sanctioned_date_field(StringView field) -{ - // This is a copy of the units sanctioned for use within ECMA-402, with names adjusted for the names used by the CLDR. - // https://tc39.es/ecma402/#table-validcodeforDateField - return field.is_one_of("era"sv, "year"sv, "quarter"sv, "month"sv, "week"sv, "weekday"sv, "day"sv, "dayperiod"sv, "hour"sv, "minute"sv, "second"sv, "zone"sv); -} - -static ErrorOr preprocess_date_fields(ByteString dates_path, CLDR& cldr) -{ - LexicalPath date_fields_path(move(dates_path)); - date_fields_path = date_fields_path.append("dateFields.json"sv); - - auto const& locale_date_fields = *TRY(read_json_file_with_cache(date_fields_path.string())); - auto const& main_object = locale_date_fields.as_object().get_object("main"sv).value(); - auto const& locale_object = main_object.get_object(date_fields_path.parent().basename()).value(); - auto const& dates_object = locale_object.get_object("dates"sv).value(); - auto const& fields_object = dates_object.get_object("fields"sv).value(); - - fields_object.for_each_member([&](auto const& key, JsonValue const&) { - if (!is_sanctioned_date_field(key)) - return; - - if (!cldr.date_fields_indices.contains(key)) { - cldr.date_fields_indices.set(key, 0); - cldr.date_fields.append(key); - } - }); - - return {}; -} - static ErrorOr parse_unicode_extension_keywords(ByteString bcp47_path, CLDR& cldr) { constexpr auto desired_keywords = Array { "ca"sv, "co"sv, "hc"sv, "kf"sv, "kn"sv, "nu"sv }; @@ -533,102 +264,6 @@ static Optional find_keyword_alias(StringView key, StringView calend return alias->name; } -static ErrorOr parse_locale_languages(ByteString locale_path, CLDR& cldr, LocaleData& locale) -{ - LexicalPath languages_path(move(locale_path)); - languages_path = languages_path.append("languages.json"sv); - - LanguageList languages; - languages.resize(cldr.languages.size()); - - if (!FileSystem::exists(languages_path.string())) { - for (size_t i = 0; i < languages.size(); ++i) - languages[i] = cldr.unique_strings.ensure(cldr.languages[i]); - - locale.languages = cldr.unique_language_lists.ensure(move(languages)); - return {}; - } - - auto const& locale_languages = *TRY(read_json_file_with_cache(languages_path.string())); - auto const& main_object = locale_languages.as_object().get_object("main"sv).value(); - auto const& locale_object = main_object.get_object(languages_path.parent().basename()).value(); - auto const& locale_display_names_object = locale_object.get_object("localeDisplayNames"sv).value(); - auto const& languages_object = locale_display_names_object.get_object("languages"sv).value(); - - languages_object.for_each_member([&](auto const& key, JsonValue const& value) { - if (key.contains("-alt-"sv)) - return; - - auto index = cldr.language_indices.get(key).value(); - languages[index] = cldr.unique_strings.ensure(value.as_string()); - }); - - locale.languages = cldr.unique_language_lists.ensure(move(languages)); - return {}; -} - -static ErrorOr parse_locale_territories(ByteString locale_path, CLDR& cldr, LocaleData& locale) -{ - LexicalPath territories_path(move(locale_path)); - territories_path = territories_path.append("territories.json"sv); - - TerritoryList territories; - territories.resize(cldr.territories.size()); - - if (!FileSystem::exists(territories_path.string())) { - for (size_t i = 0; i < territories.size(); ++i) - territories[i] = cldr.unique_strings.ensure(cldr.territories[i]); - - locale.territories = cldr.unique_territory_lists.ensure(move(territories)); - return {}; - } - - auto locale_territories = TRY(read_json_file(territories_path.string())); - auto const& main_object = locale_territories.as_object().get_object("main"sv).value(); - auto const& locale_object = main_object.get_object(territories_path.parent().basename()).value(); - auto const& locale_display_names_object = locale_object.get_object("localeDisplayNames"sv).value(); - auto const& territories_object = locale_display_names_object.get_object("territories"sv).value(); - - territories_object.for_each_member([&](auto const& key, JsonValue const& value) { - if (auto index = cldr.territory_indices.get(key); index.has_value()) - territories[*index] = cldr.unique_strings.ensure(value.as_string()); - }); - - locale.territories = cldr.unique_territory_lists.ensure(move(territories)); - return {}; -} - -static ErrorOr parse_locale_scripts(ByteString locale_path, CLDR& cldr, LocaleData& locale) -{ - LexicalPath scripts_path(move(locale_path)); - scripts_path = scripts_path.append("scripts.json"sv); - - ScriptList scripts; - scripts.resize(cldr.scripts.size()); - - if (!FileSystem::exists(scripts_path.string())) { - for (size_t i = 0; i < scripts.size(); ++i) - scripts[i] = cldr.unique_strings.ensure(cldr.scripts[i]); - - locale.scripts = cldr.unique_script_lists.ensure(move(scripts)); - return {}; - } - - auto locale_scripts = TRY(read_json_file(scripts_path.string())); - auto const& main_object = locale_scripts.as_object().get_object("main"sv).value(); - auto const& locale_object = main_object.get_object(scripts_path.parent().basename()).value(); - auto const& locale_display_names_object = locale_object.get_object("localeDisplayNames"sv).value(); - auto const& scripts_object = locale_display_names_object.get_object("scripts"sv).value(); - - scripts_object.for_each_member([&](auto const& key, JsonValue const& value) { - if (auto index = cldr.script_indices.get(key); index.has_value()) - scripts[*index] = cldr.unique_strings.ensure(value.as_string()); - }); - - locale.scripts = cldr.unique_script_lists.ensure(move(scripts)); - return {}; -} - static ErrorOr parse_locale_list_patterns(ByteString misc_path, CLDR& cldr, LocaleData& locale) { LexicalPath list_patterns_path(move(misc_path)); @@ -711,124 +346,6 @@ static ErrorOr parse_locale_layout(ByteString misc_path, CLDR& cldr, Local return {}; } -static ErrorOr parse_locale_currencies(ByteString numbers_path, CLDR& cldr, LocaleData& locale) -{ - LexicalPath currencies_path(move(numbers_path)); - currencies_path = currencies_path.append("currencies.json"sv); - - auto const& locale_currencies = *TRY(read_json_file_with_cache(currencies_path.string())); - auto const& main_object = locale_currencies.as_object().get_object("main"sv).value(); - auto const& locale_object = main_object.get_object(currencies_path.parent().basename()).value(); - auto const& locale_numbers_object = locale_object.get_object("numbers"sv).value(); - auto const& currencies_object = locale_numbers_object.get_object("currencies"sv).value(); - - CurrencyList long_currencies {}; - long_currencies.resize(cldr.currencies.size()); - - CurrencyList short_currencies {}; - short_currencies.resize(cldr.currencies.size()); - - CurrencyList narrow_currencies {}; - narrow_currencies.resize(cldr.currencies.size()); - - CurrencyList numeric_currencies {}; - numeric_currencies.resize(cldr.currencies.size()); - - currencies_object.for_each_member([&](auto const& key, JsonValue const& value) { - auto long_name = value.as_object().get_byte_string("displayName"sv).value_or(key); - auto short_name = value.as_object().get_byte_string("symbol"sv).value_or(key); - auto narrow_name = value.as_object().get_byte_string("symbol-alt-narrow"sv); - auto numeric_name = value.as_object().get_byte_string("displayName-count-other"sv); - - auto index = cldr.currency_indices.get(key).value(); - long_currencies[index] = cldr.unique_strings.ensure(move(long_name)); - short_currencies[index] = cldr.unique_strings.ensure(move(short_name)); - narrow_currencies[index] = narrow_name.has_value() ? cldr.unique_strings.ensure(narrow_name.release_value()) : 0; - numeric_currencies[index] = numeric_name.has_value() ? cldr.unique_strings.ensure(numeric_name.release_value()) : long_currencies[index]; - }); - - locale.long_currencies = cldr.unique_currency_lists.ensure(move(long_currencies)); - locale.short_currencies = cldr.unique_currency_lists.ensure(move(short_currencies)); - locale.narrow_currencies = cldr.unique_currency_lists.ensure(move(narrow_currencies)); - locale.numeric_currencies = cldr.unique_currency_lists.ensure(move(numeric_currencies)); - return {}; -} - -static ErrorOr parse_locale_calendars(ByteString locale_path, CLDR& cldr, LocaleData& locale) -{ - LexicalPath locale_display_names_path(move(locale_path)); - locale_display_names_path = locale_display_names_path.append("localeDisplayNames.json"sv); - - auto const& locale_display_names = *TRY(read_json_file_with_cache(locale_display_names_path.string())); - auto const& main_object = locale_display_names.as_object().get_object("main"sv).value(); - auto const& locale_object = main_object.get_object(locale_display_names_path.parent().basename()).value(); - auto const& locale_display_names_object = locale_object.get_object("localeDisplayNames"sv).value(); - - if (!locale_display_names_object.has_object("types"sv)) - return {}; - - auto const& types_object = locale_display_names_object.get_object("types"sv).value(); - auto const& calendar_object = types_object.get_object("calendar"sv).value(); - - auto const& supported_calendars = cldr.keywords.find("ca"sv)->value; - - CalendarList calendars; - calendars.resize(supported_calendars.size()); - - calendar_object.for_each_member([&](auto const& key, auto const& calendar) { - auto index = supported_calendars.find_first_index(key); - if (!index.has_value()) { - auto alias = find_keyword_alias("ca"sv, key, cldr); - index = supported_calendars.find_first_index(*alias); - } - - calendars[*index] = cldr.unique_strings.ensure(calendar.as_string()); - }); - - locale.calendars = cldr.unique_calendar_lists.ensure(move(calendars)); - return {}; -} - -static ErrorOr parse_locale_date_fields(ByteString dates_path, CLDR& cldr, LocaleData& locale) -{ - LexicalPath date_fields_path(move(dates_path)); - date_fields_path = date_fields_path.append("dateFields.json"sv); - - auto const& locale_date_fields = *TRY(read_json_file_with_cache(date_fields_path.string())); - auto const& main_object = locale_date_fields.as_object().get_object("main"sv).value(); - auto const& locale_object = main_object.get_object(date_fields_path.parent().basename()).value(); - auto const& dates_object = locale_object.get_object("dates"sv).value(); - auto const& fields_object = dates_object.get_object("fields"sv).value(); - - DateFieldList long_date_fields {}; - long_date_fields.resize(cldr.date_fields.size()); - - DateFieldList short_date_fields {}; - short_date_fields.resize(cldr.date_fields.size()); - - DateFieldList narrow_date_fields {}; - narrow_date_fields.resize(cldr.date_fields.size()); - - fields_object.for_each_member([&](auto const& key, JsonValue const& value) { - if (!is_sanctioned_date_field(key)) - return; - - auto const& long_name = value.as_object().get_byte_string("displayName"sv).value(); - auto const& short_name = fields_object.get_object(ByteString::formatted("{}-short", key))->get_byte_string("displayName"sv).value(); - auto const& narrow_name = fields_object.get_object(ByteString::formatted("{}-narrow", key))->get_byte_string("displayName"sv).value(); - - auto index = cldr.date_fields_indices.get(key).value(); - long_date_fields[index] = cldr.unique_strings.ensure(long_name); - short_date_fields[index] = cldr.unique_strings.ensure(short_name); - narrow_date_fields[index] = cldr.unique_strings.ensure(narrow_name); - }); - - locale.long_date_fields = cldr.unique_date_field_lists.ensure(move(long_date_fields)); - locale.short_date_fields = cldr.unique_date_field_lists.ensure(move(short_date_fields)); - locale.narrow_date_fields = cldr.unique_date_field_lists.ensure(move(narrow_date_fields)); - return {}; -} - static ErrorOr parse_number_system_keywords(ByteString locale_numbers_path, CLDR& cldr, LocaleData& locale) { LexicalPath numbers_path(move(locale_numbers_path)); @@ -1008,13 +525,12 @@ static ErrorOr define_aliases_without_scripts(CLDR& cldr) return {}; } -static ErrorOr parse_all_locales(ByteString bcp47_path, ByteString core_path, ByteString locale_names_path, ByteString misc_path, ByteString numbers_path, ByteString dates_path, CLDR& cldr) +static ErrorOr parse_all_locales(ByteString bcp47_path, ByteString core_path, ByteString misc_path, ByteString numbers_path, ByteString dates_path, CLDR& cldr) { LexicalPath core_supplemental_path(core_path); core_supplemental_path = core_supplemental_path.append("supplemental"sv); VERIFY(FileSystem::is_directory(core_supplemental_path.string())); - TRY(parse_core_aliases(core_supplemental_path.string(), cldr)); TRY(parse_likely_subtags(core_supplemental_path.string(), cldr)); auto remove_variants_from_path = [&](ByteString path) -> ErrorOr { @@ -1030,66 +546,12 @@ static ErrorOr parse_all_locales(ByteString bcp47_path, ByteString core_pa return builder.to_byte_string(); }; - TRY(Core::Directory::for_each_entry(TRY(String::formatted("{}/main", locale_names_path)), Core::DirIterator::SkipParentAndBaseDir, [&](auto& entry, auto& directory) -> ErrorOr { - auto locale_path = LexicalPath::join(directory.path().string(), entry.name).string(); - auto language = TRY(remove_variants_from_path(locale_path)); - - auto& locale = cldr.locales.ensure(language); - TRY(parse_identity(locale_path, cldr, locale)); - return IterationDecision::Continue; - })); - - TRY(Core::Directory::for_each_entry(TRY(String::formatted("{}/main", locale_names_path)), Core::DirIterator::SkipParentAndBaseDir, [&](auto& entry, auto& directory) -> ErrorOr { - auto locale_path = LexicalPath::join(directory.path().string(), entry.name).string(); - TRY(preprocess_languages(locale_path, cldr)); - return IterationDecision::Continue; - })); - - TRY(Core::Directory::for_each_entry(TRY(String::formatted("{}/main", numbers_path)), Core::DirIterator::SkipParentAndBaseDir, [&](auto& entry, auto& directory) -> ErrorOr { - auto numbers_path = LexicalPath::join(directory.path().string(), entry.name).string(); - TRY(preprocess_currencies(numbers_path, cldr)); - return IterationDecision::Continue; - })); - - TRY(Core::Directory::for_each_entry(TRY(String::formatted("{}/main", dates_path)), Core::DirIterator::SkipParentAndBaseDir, [&](auto& entry, auto& directory) -> ErrorOr { - auto dates_path = LexicalPath::join(directory.path().string(), entry.name).string(); - TRY(preprocess_date_fields(dates_path, cldr)); - return IterationDecision::Continue; - })); - - auto update_indices = [](auto& keys, auto& indices) { - quick_sort(keys); - - for (size_t i = 0; i < keys.size(); ++i) - indices.set(keys[i], i); - }; - - update_indices(cldr.languages, cldr.language_indices); - update_indices(cldr.territories, cldr.territory_indices); - update_indices(cldr.scripts, cldr.script_indices); - update_indices(cldr.variants, cldr.variant_indices); - update_indices(cldr.currencies, cldr.currency_indices); - update_indices(cldr.date_fields, cldr.date_fields_indices); - TRY(Core::Directory::for_each_entry(TRY(String::formatted("{}/bcp47", bcp47_path)), Core::DirIterator::SkipParentAndBaseDir, [&](auto& entry, auto& directory) -> ErrorOr { auto bcp47_path = LexicalPath::join(directory.path().string(), entry.name).string(); TRY(parse_unicode_extension_keywords(move(bcp47_path), cldr)); return IterationDecision::Continue; })); - TRY(Core::Directory::for_each_entry(TRY(String::formatted("{}/main", locale_names_path)), Core::DirIterator::SkipParentAndBaseDir, [&](auto& entry, auto& directory) -> ErrorOr { - auto locale_path = LexicalPath::join(directory.path().string(), entry.name).string(); - auto language = TRY(remove_variants_from_path(locale_path)); - - auto& locale = cldr.locales.ensure(language); - TRY(parse_locale_display_patterns(locale_path, cldr, locale)); - TRY(parse_locale_languages(locale_path, cldr, locale)); - TRY(parse_locale_territories(locale_path, cldr, locale)); - TRY(parse_locale_scripts(locale_path, cldr, locale)); - TRY(parse_locale_calendars(locale_path, cldr, locale)); - return IterationDecision::Continue; - })); - TRY(Core::Directory::for_each_entry(TRY(String::formatted("{}/main", misc_path)), Core::DirIterator::SkipParentAndBaseDir, [&](auto& entry, auto& directory) -> ErrorOr { auto misc_path = LexicalPath::join(directory.path().string(), entry.name).string(); auto language = TRY(remove_variants_from_path(misc_path)); @@ -1105,7 +567,6 @@ static ErrorOr parse_all_locales(ByteString bcp47_path, ByteString core_pa auto language = TRY(remove_variants_from_path(numbers_path)); auto& locale = cldr.locales.ensure(language); - TRY(parse_locale_currencies(numbers_path, cldr, locale)); TRY(parse_number_system_keywords(numbers_path, cldr, locale)); fill_in_collation_keywords(cldr, locale); return IterationDecision::Continue; @@ -1116,7 +577,6 @@ static ErrorOr parse_all_locales(ByteString bcp47_path, ByteString core_pa auto language = TRY(remove_variants_from_path(dates_path)); auto& locale = cldr.locales.ensure(language); - TRY(parse_locale_date_fields(dates_path, cldr, locale)); TRY(parse_calendar_keywords(dates_path, cldr, locale)); return IterationDecision::Continue; })); @@ -1144,12 +604,6 @@ namespace Locale { auto keywords = cldr.keywords.keys(); generate_enum(generator, format_identifier, "Locale"sv, "None"sv, locales, cldr.locale_aliases); - generate_enum(generator, format_identifier, "Language"sv, {}, cldr.languages); - generate_enum(generator, format_identifier, "Territory"sv, {}, cldr.territories); - generate_enum(generator, format_identifier, "ScriptTag"sv, {}, cldr.scripts); - generate_enum(generator, format_identifier, "Currency"sv, {}, cldr.currencies); - generate_enum(generator, format_identifier, "DateField"sv, {}, cldr.date_fields, cldr.date_field_aliases); - generate_enum(generator, format_identifier, "Variant"sv, {}, cldr.variants); generate_enum(generator, format_identifier, "ListPatternType"sv, {}, cldr.list_pattern_types); generate_enum(generator, format_identifier, "CharacterOrder"sv, {}, cldr.character_orders); generate_enum(generator, format_identifier, "Key"sv, {}, keywords); @@ -1180,7 +634,6 @@ static ErrorOr generate_unicode_locale_implementation(Core::InputBufferedF SourceGenerator generator { builder }; generator.set("string_index_type"sv, string_index_type); generator.set("locales_size"sv, ByteString::number(cldr.locales.size())); - generator.set("territories_size", ByteString::number(cldr.territories.size())); generator.set("variants_size", ByteString::number(cldr.max_variant_size)); generator.append(R"~~~( @@ -1202,20 +655,6 @@ namespace Locale { cldr.unique_strings.generate(generator); generator.append(R"~~~( -struct DisplayPatternImpl { - DisplayPattern to_display_pattern() const - { - DisplayPattern display_patterns {}; - display_patterns.locale_pattern = decode_string(locale_pattern); - display_patterns.locale_separator = decode_string(locale_separator); - - return display_patterns; - } - - @string_index_type@ locale_pattern { 0 }; - @string_index_type@ locale_separator { 0 }; -}; - struct Patterns { ListPatternType type; Style style; @@ -1244,7 +683,6 @@ struct TextLayout { }); generate_available_values(generator, "get_available_hour_cycles"sv, cldr.keywords.find("hc"sv)->value); generate_available_values(generator, "get_available_number_systems"sv, cldr.keywords.find("nu"sv)->value); - generate_available_values(generator, "get_available_currencies"sv, cldr.currencies); generator.append(R"~~~( ReadonlySpan get_available_keyword_values(StringView key) @@ -1272,13 +710,6 @@ ReadonlySpan get_available_keyword_values(StringView key) } )~~~"); - cldr.unique_display_patterns.generate(generator, "DisplayPatternImpl"sv, "s_display_patterns"sv, 30); - cldr.unique_language_lists.generate(generator, string_index_type, "s_language_lists"sv); - cldr.unique_territory_lists.generate(generator, string_index_type, "s_territory_lists"sv); - cldr.unique_script_lists.generate(generator, string_index_type, "s_script_lists"sv); - cldr.unique_currency_lists.generate(generator, string_index_type, "s_currency_lists"sv); - cldr.unique_calendar_lists.generate(generator, string_index_type, "s_calendar_lists"sv); - cldr.unique_date_field_lists.generate(generator, string_index_type, "s_date_field_lists"sv); cldr.unique_keyword_lists.generate(generator, string_index_type, "s_keyword_lists"sv); cldr.unique_list_patterns.generate(generator, "Patterns"sv, "s_list_patterns"sv, 10); cldr.unique_list_pattern_lists.generate(generator, cldr.unique_list_patterns.type_that_fits(), "s_list_pattern_lists"sv); @@ -1328,18 +759,6 @@ static constexpr Array<@type@, @size@> @name@ { {)~~~"); auto locales = cldr.locales.keys(); quick_sort(locales); - append_mapping(locales, cldr.locales, cldr.unique_display_patterns.type_that_fits(), "s_locale_display_patterns"sv, [&](auto const& locale) { return locale.display_patterns; }); - append_mapping(locales, cldr.locales, cldr.unique_language_lists.type_that_fits(), "s_languages"sv, [&](auto const& locale) { return locale.languages; }); - append_mapping(locales, cldr.locales, cldr.unique_territory_lists.type_that_fits(), "s_territories"sv, [&](auto const& locale) { return locale.territories; }); - append_mapping(locales, cldr.locales, cldr.unique_script_lists.type_that_fits(), "s_scripts"sv, [&](auto const& locale) { return locale.scripts; }); - append_mapping(locales, cldr.locales, cldr.unique_currency_lists.type_that_fits(), "s_long_currencies"sv, [&](auto const& locale) { return locale.long_currencies; }); - append_mapping(locales, cldr.locales, cldr.unique_currency_lists.type_that_fits(), "s_short_currencies"sv, [&](auto const& locale) { return locale.short_currencies; }); - append_mapping(locales, cldr.locales, cldr.unique_currency_lists.type_that_fits(), "s_narrow_currencies"sv, [&](auto const& locale) { return locale.narrow_currencies; }); - append_mapping(locales, cldr.locales, cldr.unique_currency_lists.type_that_fits(), "s_numeric_currencies"sv, [&](auto const& locale) { return locale.numeric_currencies; }); - append_mapping(locales, cldr.locales, cldr.unique_calendar_lists.type_that_fits(), "s_calendars"sv, [&](auto const& locale) { return locale.calendars; }); - append_mapping(locales, cldr.locales, cldr.unique_date_field_lists.type_that_fits(), "s_long_date_fields"sv, [&](auto const& locale) { return locale.long_date_fields; }); - append_mapping(locales, cldr.locales, cldr.unique_date_field_lists.type_that_fits(), "s_short_date_fields"sv, [&](auto const& locale) { return locale.short_date_fields; }); - append_mapping(locales, cldr.locales, cldr.unique_date_field_lists.type_that_fits(), "s_narrow_date_fields"sv, [&](auto const& locale) { return locale.narrow_date_fields; }); append_mapping(locales, cldr.locales, cldr.unique_keyword_lists.type_that_fits(), "s_calendar_keywords"sv, [&](auto const& locale) { return locale.calendar_keywords; }); append_mapping(locales, cldr.locales, cldr.unique_keyword_lists.type_that_fits(), "s_collation_case_keywords"sv, [&](auto const& locale) { return locale.collation_case_keywords; }); append_mapping(locales, cldr.locales, cldr.unique_keyword_lists.type_that_fits(), "s_collation_numeric_keywords"sv, [&](auto const& locale) { return locale.collation_numeric_keywords; }); @@ -1351,42 +770,11 @@ static constexpr Array<@type@, @size@> @name@ { {)~~~"); struct CanonicalLanguageID { - LanguageID to_unicode_language_id() const - { - LanguageID language_id {}; - language_id.variants.ensure_capacity(variants_size); - - language_id.language = MUST(String::from_utf8(decode_string(language))); - if (script != 0) - language_id.script = MUST(String::from_utf8(decode_string(script))); - if (region != 0) - language_id.region = MUST(String::from_utf8(decode_string(region))); - for (size_t i = 0; i < variants_size; ++i) - language_id.variants.append(MUST(String::from_utf8(decode_string(variants[i])))); - - return language_id; - } - - bool matches_variants(Vector const& other_variants) const { - if (variants_size == 0) - return true; - if (other_variants.size() != variants_size) - return false; - - for (size_t i = 0; i < variants_size; ++i) { - if (decode_string(variants[i]) != other_variants[i]) - return false; - } - - return true; - }; - @string_index_type@ language { 0 }; @string_index_type@ script { 0 }; @string_index_type@ region { 0 }; Array<@string_index_type@, @variants_size@> variants {}; size_t variants_size { 0 }; - }; struct LanguageMapping { @@ -1436,7 +824,6 @@ static constexpr Array s_@name@ { { generator.append("} };\n"); }; - append_complex_mapping("complex_alias"sv, cldr.complex_mappings); append_complex_mapping("likely_subtags"sv, cldr.likely_subtags); generator.append(R"~~~( @@ -1529,39 +916,6 @@ static LanguageMapping const* resolve_likely_subtag(LanguageID const& language_i )~~~"); - auto append_mapping_search = [&](StringView enum_snake, StringView from_string_name, StringView collection_name, StringView unique_list) { - generator.set("enum_snake", enum_snake); - generator.set("from_string_name", from_string_name); - generator.set("collection_name", collection_name); - generator.set("unique_list", unique_list); - - generator.append(R"~~~( -Optional get_locale_@enum_snake@_mapping(StringView locale, StringView @enum_snake@) -{ - auto locale_value = locale_from_string(locale); - if (!locale_value.has_value()) - return {}; - - auto @enum_snake@_value = @from_string_name@_from_string(@enum_snake@); - if (!@enum_snake@_value.has_value()) - return {}; - - auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None. - auto @enum_snake@_index = to_underlying(*@enum_snake@_value); - - auto mapping_index = @collection_name@.at(locale_index); - auto const& mappings = @unique_list@.at(mapping_index); - - auto @enum_snake@_string_index = mappings.at(@enum_snake@_index); - auto @enum_snake@_mapping = decode_string(@enum_snake@_string_index); - - if (@enum_snake@_mapping.is_empty()) - return {}; - return @enum_snake@_mapping; -} -)~~~"); - }; - auto append_from_string = [&](StringView enum_title, StringView enum_snake, auto const& values, Vector const& aliases = {}) -> ErrorOr { HashValueMap hashes; TRY(hashes.try_ensure_capacity(values.size())); @@ -1576,47 +930,7 @@ Optional get_locale_@enum_snake@_mapping(StringView locale, StringVi return {}; }; - auto append_alias_search = [&](StringView enum_snake, auto const& aliases) -> ErrorOr { - HashValueMap hashes; - TRY(hashes.try_ensure_capacity(aliases.size())); - - for (auto const& alias : aliases) - hashes.set(alias.key.hash(), alias.value); - - ValueFromStringOptions options {}; - options.return_type = "StringView"sv; - options.return_format = "decode_string({})"sv; - - generate_value_from_string(generator, "resolve_{}_alias"sv, string_index_type, enum_snake, move(hashes), options); - - return {}; - }; - TRY(append_from_string("Locale"sv, "locale"sv, cldr.locales.keys(), cldr.locale_aliases)); - - TRY(append_from_string("Language"sv, "language"sv, cldr.languages)); - append_mapping_search("language"sv, "language"sv, "s_languages"sv, "s_language_lists"sv); - TRY(append_alias_search("language"sv, cldr.language_aliases)); - - TRY(append_from_string("Territory"sv, "territory"sv, cldr.territories)); - append_mapping_search("territory"sv, "territory"sv, "s_territories"sv, "s_territory_lists"sv); - TRY(append_alias_search("territory"sv, cldr.territory_aliases)); - - TRY(append_from_string("ScriptTag"sv, "script_tag"sv, cldr.scripts)); - append_mapping_search("script"sv, "script_tag"sv, "s_scripts"sv, "s_script_lists"sv); - TRY(append_alias_search("script_tag"sv, cldr.script_aliases)); - - TRY(append_from_string("Currency"sv, "currency"sv, cldr.currencies)); - append_mapping_search("long_currency"sv, "currency"sv, "s_long_currencies"sv, "s_currency_lists"sv); - append_mapping_search("short_currency"sv, "currency"sv, "s_short_currencies"sv, "s_currency_lists"sv); - append_mapping_search("narrow_currency"sv, "currency"sv, "s_narrow_currencies"sv, "s_currency_lists"sv); - append_mapping_search("numeric_currency"sv, "currency"sv, "s_numeric_currencies"sv, "s_currency_lists"sv); - - TRY(append_from_string("DateField"sv, "date_field"sv, cldr.date_fields, cldr.date_field_aliases)); - append_mapping_search("long_date_field"sv, "date_field"sv, "s_long_date_fields"sv, "s_date_field_lists"sv); - append_mapping_search("short_date_field"sv, "date_field"sv, "s_short_date_fields"sv, "s_date_field_lists"sv); - append_mapping_search("narrow_date_field"sv, "date_field"sv, "s_narrow_date_fields"sv, "s_date_field_lists"sv); - TRY(append_from_string("Key"sv, "key"sv, cldr.keywords.keys())); for (auto const& keyword : cldr.keywords) { @@ -1630,11 +944,6 @@ Optional get_locale_@enum_snake@_mapping(StringView locale, StringVi TRY(append_from_string(enum_name, enum_snake, keyword.value)); } - append_mapping_search("calendar"sv, "keyword_ca"sv, "s_calendars"sv, "s_calendar_lists"sv); - - TRY(append_alias_search("variant"sv, cldr.variant_aliases)); - TRY(append_alias_search("subdivision"sv, cldr.subdivision_aliases)); - TRY(append_from_string("ListPatternType"sv, "list_pattern_type"sv, cldr.list_pattern_types)); TRY(append_from_string("CharacterOrder"sv, "character_order"sv, cldr.character_orders)); @@ -1735,19 +1044,6 @@ Vector get_keywords_for_locale(StringView locale, StringView key) return keywords; } -Optional get_locale_display_patterns(StringView locale) -{ - auto locale_value = locale_from_string(locale); - if (!locale_value.has_value()) - return {}; - - auto locale_index = to_underlying(*locale_value) - 1; // Subtract 1 because 0 == Locale::None. - auto display_patterns_index = s_locale_display_patterns.at(locale_index); - - auto const& display_patterns = s_display_patterns.at(display_patterns_index); - return display_patterns.to_display_pattern(); -} - Optional get_locale_list_patterns(StringView locale, StringView list_pattern_type, Style list_pattern_style) { auto locale_value = locale_from_string(locale); @@ -1798,37 +1094,6 @@ Optional character_order_for_locale(StringView locale) return {}; } -void resolve_complex_language_aliases(LanguageID& language_id) -{ - for (auto const& map : s_complex_alias) { - auto key_language = decode_string(map.key.language); - auto key_script = decode_string(map.key.script); - auto key_region = decode_string(map.key.region); - - if ((key_language != language_id.language) && (key_language != "und"sv)) - continue; - if (!key_script.is_empty() && (key_script != language_id.script)) - continue; - if (!key_region.is_empty() && (key_region != language_id.region)) - continue; - if (!map.key.matches_variants(language_id.variants)) - continue; - - auto alias = map.alias.to_unicode_language_id(); - - if (alias.language == "und"sv) - alias.language = move(language_id.language); - if (key_script.is_empty() && !alias.script.has_value()) - alias.script = move(language_id.script); - if (key_region.is_empty() && !alias.region.has_value()) - alias.region = move(language_id.region); - if (map.key.variants_size == 0 && alias.variants.is_empty()) - alias.variants = move(language_id.variants); - - language_id = move(alias); - break; - } -} Optional add_likely_subtags(LanguageID const& language_id) { @@ -1856,13 +1121,6 @@ Optional add_likely_subtags(LanguageID const& language_id) return maximized; } -Optional resolve_most_likely_territory(LanguageID const& language_id) -{ - if (auto const* likely_subtag = resolve_likely_subtag(language_id); likely_subtag != nullptr) - return MUST(String::from_utf8(decode_string(likely_subtag->alias.region))); - return OptionalNone {}; -} - } )~~~"); @@ -1876,7 +1134,6 @@ ErrorOr serenity_main(Main::Arguments arguments) StringView generated_implementation_path; StringView bcp47_path; StringView core_path; - StringView locale_names_path; StringView misc_path; StringView numbers_path; StringView dates_path; @@ -1886,7 +1143,6 @@ ErrorOr serenity_main(Main::Arguments arguments) args_parser.add_option(generated_implementation_path, "Path to the Unicode locale implementation file to generate", "generated-implementation-path", 'c', "generated-implementation-path"); args_parser.add_option(bcp47_path, "Path to cldr-bcp47 directory", "bcp47-path", 'b', "bcp47-path"); args_parser.add_option(core_path, "Path to cldr-core directory", "core-path", 'r', "core-path"); - args_parser.add_option(locale_names_path, "Path to cldr-localenames directory", "locale-names-path", 'l', "locale-names-path"); args_parser.add_option(misc_path, "Path to cldr-misc directory", "misc-path", 'm', "misc-path"); args_parser.add_option(numbers_path, "Path to cldr-numbers directory", "numbers-path", 'n', "numbers-path"); args_parser.add_option(dates_path, "Path to cldr-dates directory", "dates-path", 'd', "dates-path"); @@ -1896,7 +1152,7 @@ ErrorOr serenity_main(Main::Arguments arguments) auto generated_implementation_file = TRY(open_file(generated_implementation_path, Core::File::OpenMode::Write)); CLDR cldr; - TRY(parse_all_locales(bcp47_path, core_path, locale_names_path, misc_path, numbers_path, dates_path, cldr)); + TRY(parse_all_locales(bcp47_path, core_path, misc_path, numbers_path, dates_path, cldr)); TRY(generate_unicode_locale_header(*generated_header_file, cldr)); TRY(generate_unicode_locale_implementation(*generated_implementation_file, cldr)); diff --git a/Tests/LibLocale/CMakeLists.txt b/Tests/LibLocale/CMakeLists.txt index a2b44e84dd6..e31bf262b4e 100644 --- a/Tests/LibLocale/CMakeLists.txt +++ b/Tests/LibLocale/CMakeLists.txt @@ -1,5 +1,6 @@ set(TEST_SOURCES TestDateTimeFormat.cpp + TestDisplayNames.cpp TestLocale.cpp ) diff --git a/Tests/LibLocale/TestDisplayNames.cpp b/Tests/LibLocale/TestDisplayNames.cpp new file mode 100644 index 00000000000..44df60557df --- /dev/null +++ b/Tests/LibLocale/TestDisplayNames.cpp @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include + +#include +#include + +TEST_CASE(locale_mappings_en) +{ + auto language = Locale::language_display_name("en"sv, "en"sv, Locale::LanguageDisplay::Standard); + EXPECT(language.has_value()); + EXPECT_EQ(*language, "English"sv); + + language = Locale::language_display_name("en"sv, "i-defintely-don't-exist"sv, Locale::LanguageDisplay::Standard); + EXPECT(!language.has_value()); + + auto territory = Locale::region_display_name("en"sv, "US"sv); + EXPECT(territory.has_value()); + EXPECT_EQ(*territory, "United States"sv); + + territory = Locale::region_display_name("en"sv, "i-defintely-don't-exist"sv); + EXPECT(!territory.has_value()); + + auto script = Locale::script_display_name("en"sv, "Latn"sv); + EXPECT(script.has_value()); + EXPECT_EQ(*script, "Latin"sv); + + script = Locale::script_display_name("en"sv, "i-defintely-don't-exist"sv); + EXPECT(!script.has_value()); +} + +TEST_CASE(locale_mappings_fr) +{ + auto language = Locale::language_display_name("fr"sv, "en"sv, Locale::LanguageDisplay::Standard); + EXPECT(language.has_value()); + EXPECT_EQ(*language, "anglais"sv); + + language = Locale::language_display_name("fr"sv, "i-defintely-don't-exist"sv, Locale::LanguageDisplay::Standard); + EXPECT(!language.has_value()); + + auto territory = Locale::region_display_name("fr"sv, "US"sv); + EXPECT(territory.has_value()); + EXPECT_EQ(*territory, "États-Unis"sv); + + territory = Locale::region_display_name("fr"sv, "i-defintely-don't-exist"sv); + EXPECT(!territory.has_value()); + + auto script = Locale::script_display_name("fr"sv, "Latn"sv); + EXPECT(script.has_value()); + EXPECT_EQ(*script, "latin"sv); + + script = Locale::script_display_name("fr"sv, "i-defintely-don't-exist"sv); + EXPECT(!script.has_value()); +} + +TEST_CASE(locale_mappings_root) +{ + auto language = Locale::language_display_name("und"sv, "en"sv, Locale::LanguageDisplay::Standard); + EXPECT(language.has_value()); + EXPECT_EQ(*language, "en"sv); + + language = Locale::language_display_name("und"sv, "i-defintely-don't-exist"sv, Locale::LanguageDisplay::Standard); + EXPECT(!language.has_value()); + + auto territory = Locale::region_display_name("und"sv, "US"sv); + EXPECT(territory.has_value()); + EXPECT_EQ(*territory, "US"sv); + + territory = Locale::region_display_name("und"sv, "i-defintely-don't-exist"sv); + EXPECT(!territory.has_value()); + + auto script = Locale::script_display_name("und"sv, "Latn"sv); + EXPECT(script.has_value()); + EXPECT_EQ(*script, "Latn"sv); + + script = Locale::script_display_name("und"sv, "i-defintely-don't-exist"sv); + EXPECT(!script.has_value()); +} diff --git a/Tests/LibLocale/TestLocale.cpp b/Tests/LibLocale/TestLocale.cpp index 76458868648..022c47b2198 100644 --- a/Tests/LibLocale/TestLocale.cpp +++ b/Tests/LibLocale/TestLocale.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -338,11 +338,8 @@ TEST_CASE(parse_unicode_locale_id_with_private_use_extension) TEST_CASE(canonicalize_unicode_locale_id) { auto test = [](StringView locale, StringView expected_canonical_locale) { - auto locale_id = Locale::parse_unicode_locale_id(locale); - VERIFY(locale_id.has_value()); - - auto canonical_locale = Locale::canonicalize_unicode_locale_id(*locale_id); - EXPECT_EQ(*canonical_locale, expected_canonical_locale); + auto canonical_locale = Locale::canonicalize_unicode_locale_id(locale); + EXPECT_EQ(canonical_locale, expected_canonical_locale); }; test("aaa"sv, "aaa"sv); @@ -373,8 +370,8 @@ TEST_CASE(canonicalize_unicode_locale_id) test("EN-U-KB-YES"sv, "en-u-kb"sv); test("en-u-kb-yes-abcd"sv, "en-u-kb-yes-abcd"sv); test("EN-U-KB-YES-ABCD"sv, "en-u-kb-yes-abcd"sv); - test("en-u-ka-yes"sv, "en-u-ka-yes"sv); - test("EN-U-KA-YES"sv, "en-u-ka-yes"sv); + test("en-u-ka-yes"sv, "en-u-ka"sv); + test("EN-U-KA-YES"sv, "en-u-ka"sv); test("en-u-1k-names"sv, "en-u-1k-names"sv); test("EN-U-1K-NAMES"sv, "en-u-1k-names"sv); test("en-u-ks-primary"sv, "en-u-ks-level1"sv); @@ -518,75 +515,3 @@ TEST_CASE(supports_locale_aliases) EXPECT(Locale::is_locale_available("zh-TW"sv)); EXPECT(Locale::is_locale_available("zh-Hant-TW"sv)); } - -TEST_CASE(locale_mappings_en) -{ - auto language = Locale::get_locale_language_mapping("en"sv, "en"sv); - EXPECT(language.has_value()); - EXPECT_EQ(*language, "English"sv); - - language = Locale::get_locale_language_mapping("en"sv, "i-defintely-don't-exist"sv); - EXPECT(!language.has_value()); - - auto territory = Locale::get_locale_territory_mapping("en"sv, "US"sv); - EXPECT(territory.has_value()); - EXPECT_EQ(*territory, "United States"sv); - - territory = Locale::get_locale_territory_mapping("en"sv, "i-defintely-don't-exist"sv); - EXPECT(!territory.has_value()); - - auto script = Locale::get_locale_script_mapping("en"sv, "Latn"sv); - EXPECT(script.has_value()); - EXPECT_EQ(*script, "Latin"sv); - - script = Locale::get_locale_script_mapping("en"sv, "i-defintely-don't-exist"sv); - EXPECT(!script.has_value()); -} - -TEST_CASE(locale_mappings_fr) -{ - auto language = Locale::get_locale_language_mapping("fr"sv, "en"sv); - EXPECT(language.has_value()); - EXPECT_EQ(*language, "anglais"sv); - - language = Locale::get_locale_language_mapping("fr"sv, "i-defintely-don't-exist"sv); - EXPECT(!language.has_value()); - - auto territory = Locale::get_locale_territory_mapping("fr"sv, "US"sv); - EXPECT(territory.has_value()); - EXPECT_EQ(*territory, "États-Unis"sv); - - territory = Locale::get_locale_territory_mapping("fr"sv, "i-defintely-don't-exist"sv); - EXPECT(!territory.has_value()); - - auto script = Locale::get_locale_script_mapping("fr"sv, "Latn"sv); - EXPECT(script.has_value()); - EXPECT_EQ(*script, "latin"sv); - - script = Locale::get_locale_script_mapping("fr"sv, "i-defintely-don't-exist"sv); - EXPECT(!script.has_value()); -} - -TEST_CASE(locale_mappings_root) -{ - auto language = Locale::get_locale_language_mapping("und"sv, "en"sv); - EXPECT(language.has_value()); - EXPECT_EQ(*language, "en"sv); - - language = Locale::get_locale_language_mapping("und"sv, "i-defintely-don't-exist"sv); - EXPECT(!language.has_value()); - - auto territory = Locale::get_locale_territory_mapping("und"sv, "US"sv); - EXPECT(territory.has_value()); - EXPECT_EQ(*territory, "US"sv); - - territory = Locale::get_locale_territory_mapping("und"sv, "i-defintely-don't-exist"sv); - EXPECT(!territory.has_value()); - - auto script = Locale::get_locale_script_mapping("und"sv, "Latn"sv); - EXPECT(script.has_value()); - EXPECT_EQ(*script, "Latn"sv); - - script = Locale::get_locale_script_mapping("und"sv, "i-defintely-don't-exist"sv); - EXPECT(!script.has_value()); -} diff --git a/Userland/Libraries/LibJS/Runtime/Intl/AbstractOperations.cpp b/Userland/Libraries/LibJS/Runtime/Intl/AbstractOperations.cpp index 20af3aac1bd..931963d3f7e 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/AbstractOperations.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/AbstractOperations.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -21,7 +21,7 @@ namespace JS::Intl { // 6.2.2 IsStructurallyValidLanguageTag ( locale ), https://tc39.es/ecma402/#sec-isstructurallyvalidlanguagetag -Optional<::Locale::LocaleID> is_structurally_valid_language_tag(StringView locale) +bool is_structurally_valid_language_tag(StringView locale) { auto contains_duplicate_variant = [&](auto& variants) { if (variants.is_empty()) @@ -37,90 +37,78 @@ Optional<::Locale::LocaleID> is_structurally_valid_language_tag(StringView local return false; }; - // IsStructurallyValidLanguageTag returns true if all of the following conditions hold, false otherwise: + // 1. Let lowerLocale be the ASCII-lowercase of locale. + // NOTE: LibLocale's parsing is case-insensitive. - // locale can be generated from the EBNF grammar for unicode_locale_id in Unicode Technical Standard #35 LDML § 3.2 Unicode Locale Identifier; + // 2. If lowerLocale cannot be matched by the unicode_locale_id Unicode locale nonterminal, return false. auto locale_id = ::Locale::parse_unicode_locale_id(locale); if (!locale_id.has_value()) - return {}; + return false; - // locale does not use any of the backwards compatibility syntax described in Unicode Technical Standard #35 LDML § 3.3 BCP 47 Conformance; - // https://unicode.org/reports/tr35/#BCP_47_Conformance + // 3. If lowerLocale uses any of the backwards compatibility syntax described in Unicode Technical Standard #35 Part 1 Core, + // Section 3.3 BCP 47 Conformance, return false. + // https://unicode.org/reports/tr35/#BCP_47_Conformance if (locale.contains('_') || locale_id->language_id.is_root || !locale_id->language_id.language.has_value()) - return {}; + return false; - // the unicode_language_id within locale contains no duplicate unicode_variant_subtag subtags; and - if (contains_duplicate_variant(locale_id->language_id.variants)) - return {}; + // 4. Let languageId be the longest prefix of lowerLocale matched by the unicode_language_id Unicode locale nonterminal. + auto& language_id = locale_id->language_id; - // if locale contains an extensions* component, that component - Vector unique_keys; + // 5. Let variants be GetLocaleVariants(languageId). + // 6. If variants is not undefined, then + if (auto& variants = language_id.variants; !variants.is_empty()) { + // a. If variants contains any duplicate subtags, return false. + if (contains_duplicate_variant(variants)) + return false; + } + + HashTable unique_keys; + + // 7. Let allExtensions be the suffix of lowerLocale following languageId. + // 8. If allExtensions contains a substring matched by the pu_extensions Unicode locale nonterminal, let extensions be + // the prefix of allExtensions preceding the longest such substring. Otherwise, let extensions be allExtensions. + // 9. If extensions is not the empty String, then for (auto& extension : locale_id->extensions) { - // does not contain any other_extensions components with duplicate [alphanum-[tTuUxX]] subtags, - // contains at most one unicode_locale_extensions component, - // contains at most one transformed_extensions component, and char key = extension.visit( [](::Locale::LocaleExtension const&) { return 'u'; }, [](::Locale::TransformedExtension const&) { return 't'; }, [](::Locale::OtherExtension const& ext) { return static_cast(to_ascii_lowercase(ext.key)); }); - if (unique_keys.contains_slow(key)) - return {}; + // a. If extensions contains any duplicate singleton subtags, return false. + if (unique_keys.set(key) != HashSetResult::InsertedNewEntry) + return false; - unique_keys.append(key); - - // if a transformed_extensions component that contains a tlang component is present, then - // the tlang component contains no duplicate unicode_variant_subtag subtags. + // b. Let transformExtension be the longest substring of extensions matched by the transformed_extensions Unicode + // locale nonterminal. If there is no such substring, return true. if (auto* transformed = extension.get_pointer<::Locale::TransformedExtension>()) { - auto& language = transformed->language; - if (language.has_value() && contains_duplicate_variant(language->variants)) - return {}; + // c. Assert: The substring of transformExtension from 0 to 3 is "-t-". + // d. Let tPrefix be the substring of transformExtension from 3. + + // e. Let tlang be the longest prefix of tPrefix matched by the tlang Unicode locale nonterminal. If there is + // no such prefix, return true. + auto& transformed_language = transformed->language; + if (!transformed_language.has_value()) + continue; + + // f. Let tlangRefinements be the longest suffix of tlang following a non-empty prefix matched by the + // unicode_language_subtag Unicode locale nonterminal. + auto& transformed_refinements = transformed_language->variants; + + // g. If tlangRefinements contains any duplicate substrings matched greedily by the unicode_variant_subtag + // Unicode locale nonterminal, return false. + if (contains_duplicate_variant(transformed_refinements)) + return false; } } - return locale_id; + // 10. Return true. + return true; } // 6.2.3 CanonicalizeUnicodeLocaleId ( locale ), https://tc39.es/ecma402/#sec-canonicalizeunicodelocaleid -String canonicalize_unicode_locale_id(::Locale::LocaleID& locale) +String canonicalize_unicode_locale_id(StringView locale) { - // Note: This implementation differs from the spec in how Step 3 is implemented. The spec assumes - // the input to this method is a string, and is written such that operations are performed on parts - // of that string. LibUnicode gives us the parsed locale in a structure, so we can mutate that - // structure directly. From a footnote in the spec: - // - // The third step of this algorithm ensures that a Unicode locale extension sequence in the - // returned language tag contains: - // * only the first instance of any attribute duplicated in the input, and - // * only the first keyword for a given key in the input. - for (auto& extension : locale.extensions) { - if (!extension.has<::Locale::LocaleExtension>()) - continue; - - auto& locale_extension = extension.get<::Locale::LocaleExtension>(); - - auto attributes = move(locale_extension.attributes); - for (auto& attribute : attributes) { - if (!locale_extension.attributes.contains_slow(attribute)) - locale_extension.attributes.append(move(attribute)); - } - - auto keywords = move(locale_extension.keywords); - for (auto& keyword : keywords) { - if (!any_of(locale_extension.keywords, [&](auto const& k) { return k.key == keyword.key; })) - locale_extension.keywords.append(move(keyword)); - } - - break; - } - - // 1. Let localeId be the string locale after performing the algorithm to transform it to canonical syntax per Unicode Technical Standard #35 LDML § 3.2.1 Canonical Unicode Locale Identifiers. - // 2. Let localeId be the string localeId after performing the algorithm to transform it to canonical form. - auto locale_id = ::Locale::canonicalize_unicode_locale_id(locale); - VERIFY(locale_id.has_value()); - - // 4. Return localeId. - return locale_id.release_value(); + return ::Locale::canonicalize_unicode_locale_id(locale); } // 6.3.1 IsWellFormedCurrencyCode ( currency ), https://tc39.es/ecma402/#sec-iswellformedcurrencycode @@ -246,12 +234,11 @@ ThrowCompletionOr> canonicalize_locale_list(VM& vm, Value locales } // v. If ! IsStructurallyValidLanguageTag(tag) is false, throw a RangeError exception. - auto locale_id = is_structurally_valid_language_tag(tag); - if (!locale_id.has_value()) + if (!is_structurally_valid_language_tag(tag)) return vm.throw_completion(ErrorType::IntlInvalidLanguageTag, tag); // vi. Let canonicalizedTag be ! CanonicalizeUnicodeLocaleId(tag). - auto canonicalized_tag = JS::Intl::canonicalize_unicode_locale_id(*locale_id); + auto canonicalized_tag = canonicalize_unicode_locale_id(tag); // vii. If canonicalizedTag is not an element of seen, append canonicalizedTag as the last element of seen. if (!seen.contains_slow(canonicalized_tag)) @@ -355,7 +342,7 @@ String insert_unicode_extension_and_canonicalize(::Locale::LocaleID locale, ::Lo // structure directly. locale.extensions.append(move(extension)); - return JS::Intl::canonicalize_unicode_locale_id(locale); + return JS::Intl::canonicalize_unicode_locale_id(locale.to_string()); } template diff --git a/Userland/Libraries/LibJS/Runtime/Intl/AbstractOperations.h b/Userland/Libraries/LibJS/Runtime/Intl/AbstractOperations.h index a4a3e1a0208..ea1593adf68 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/AbstractOperations.h +++ b/Userland/Libraries/LibJS/Runtime/Intl/AbstractOperations.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -81,8 +81,8 @@ struct PatternPartitionWithSource : public PatternPartition { using StringOrBoolean = Variant; -Optional<::Locale::LocaleID> is_structurally_valid_language_tag(StringView locale); -String canonicalize_unicode_locale_id(::Locale::LocaleID& locale); +bool is_structurally_valid_language_tag(StringView locale); +String canonicalize_unicode_locale_id(StringView locale); bool is_well_formed_currency_code(StringView currency); bool is_well_formed_unit_identifier(StringView unit_identifier); ThrowCompletionOr> canonicalize_locale_list(VM&, Value locales); diff --git a/Userland/Libraries/LibJS/Runtime/Intl/DisplayNames.cpp b/Userland/Libraries/LibJS/Runtime/Intl/DisplayNames.cpp index 37ed147a7b6..a3e5c3923bc 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/DisplayNames.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/DisplayNames.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -78,30 +78,6 @@ StringView DisplayNames::fallback_string() const } } -void DisplayNames::set_language_display(StringView language_display) -{ - if (language_display == "dialect"sv) - m_language_display = LanguageDisplay::Dialect; - else if (language_display == "standard"sv) - m_language_display = LanguageDisplay::Standard; - else - VERIFY_NOT_REACHED(); -} - -StringView DisplayNames::language_display_string() const -{ - VERIFY(m_language_display.has_value()); - - switch (*m_language_display) { - case LanguageDisplay::Dialect: - return "dialect"sv; - case LanguageDisplay::Standard: - return "standard"sv; - default: - VERIFY_NOT_REACHED(); - } -} - // 12.5.1 CanonicalCodeForDisplayNames ( type, code ), https://tc39.es/ecma402/#sec-canonicalcodefordisplaynames ThrowCompletionOr canonical_code_for_display_names(VM& vm, DisplayNames::Type type, StringView code) { @@ -112,12 +88,11 @@ ThrowCompletionOr canonical_code_for_display_names(VM& vm, DisplayNames:: return vm.throw_completion(ErrorType::OptionIsNotValidValue, code, "language"sv); // b. If IsStructurallyValidLanguageTag(code) is false, throw a RangeError exception. - auto locale_id = is_structurally_valid_language_tag(code); - if (!locale_id.has_value()) + if (!is_structurally_valid_language_tag(code)) return vm.throw_completion(ErrorType::IntlInvalidLanguageTag, code); // c. Return ! CanonicalizeUnicodeLocaleId(code). - auto canonicalized_tag = JS::Intl::canonicalize_unicode_locale_id(*locale_id); + auto canonicalized_tag = canonicalize_unicode_locale_id(code); return PrimitiveString::create(vm, move(canonicalized_tag)); } diff --git a/Userland/Libraries/LibJS/Runtime/Intl/DisplayNames.h b/Userland/Libraries/LibJS/Runtime/Intl/DisplayNames.h index b10056ccfcd..905ab331678 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/DisplayNames.h +++ b/Userland/Libraries/LibJS/Runtime/Intl/DisplayNames.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -10,6 +10,7 @@ #include #include #include +#include #include namespace JS::Intl { @@ -34,11 +35,6 @@ class DisplayNames final : public Object { Code, }; - enum class LanguageDisplay { - Dialect, - Standard, - }; - public: virtual ~DisplayNames() override = default; @@ -58,18 +54,18 @@ public: StringView fallback_string() const; bool has_language_display() const { return m_language_display.has_value(); } - LanguageDisplay language_display() const { return *m_language_display; } - void set_language_display(StringView language_display); - StringView language_display_string() const; + ::Locale::LanguageDisplay language_display() const { return *m_language_display; } + void set_language_display(StringView language_display) { m_language_display = ::Locale::language_display_from_string(language_display); } + StringView language_display_string() const { return ::Locale::language_display_to_string(*m_language_display); } private: DisplayNames(Object& prototype); - String m_locale; // [[Locale]] - ::Locale::Style m_style { ::Locale::Style::Long }; // [[Style]] - Type m_type { Type::Invalid }; // [[Type]] - Fallback m_fallback { Fallback::Invalid }; // [[Fallback]] - Optional m_language_display {}; // [[LanguageDisplay]] + String m_locale; // [[Locale]] + ::Locale::Style m_style { ::Locale::Style::Long }; // [[Style]] + Type m_type { Type::Invalid }; // [[Type]] + Fallback m_fallback { Fallback::Invalid }; // [[Fallback]] + Optional<::Locale::LanguageDisplay> m_language_display; // [[LanguageDisplay]] }; ThrowCompletionOr canonical_code_for_display_names(VM&, DisplayNames::Type, StringView code); diff --git a/Userland/Libraries/LibJS/Runtime/Intl/DisplayNamesPrototype.cpp b/Userland/Libraries/LibJS/Runtime/Intl/DisplayNamesPrototype.cpp index c30f3982f1a..89c7aac6263 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/DisplayNamesPrototype.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/DisplayNamesPrototype.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -9,7 +9,7 @@ #include #include #include -#include +#include namespace JS::Intl { @@ -53,58 +53,26 @@ JS_DEFINE_NATIVE_FUNCTION(DisplayNamesPrototype::of) // 5. Let fields be displayNames.[[Fields]]. // 6. If fields has a field [[]], return fields.[[]]. - Optional result; - Optional formatted_result; + Optional result; switch (display_names->type()) { case DisplayNames::Type::Language: - if (display_names->language_display() == DisplayNames::LanguageDisplay::Dialect) { - result = ::Locale::get_locale_language_mapping(display_names->locale(), code_string); - if (result.has_value()) - break; - } - - if (auto locale = is_structurally_valid_language_tag(code_string); locale.has_value()) - formatted_result = ::Locale::format_locale_for_display(display_names->locale(), locale.release_value()); + result = ::Locale::language_display_name(display_names->locale(), code_string, display_names->language_display()); break; case DisplayNames::Type::Region: - result = ::Locale::get_locale_territory_mapping(display_names->locale(), code_string); + result = ::Locale::region_display_name(display_names->locale(), code_string); break; case DisplayNames::Type::Script: - result = ::Locale::get_locale_script_mapping(display_names->locale(), code_string); + result = ::Locale::script_display_name(display_names->locale(), code_string); break; case DisplayNames::Type::Currency: - switch (display_names->style()) { - case ::Locale::Style::Long: - result = ::Locale::get_locale_long_currency_mapping(display_names->locale(), code_string); - break; - case ::Locale::Style::Short: - result = ::Locale::get_locale_short_currency_mapping(display_names->locale(), code_string); - break; - case ::Locale::Style::Narrow: - result = ::Locale::get_locale_narrow_currency_mapping(display_names->locale(), code_string); - break; - default: - VERIFY_NOT_REACHED(); - } + result = ::Locale::currency_display_name(display_names->locale(), code_string, display_names->style()); break; case DisplayNames::Type::Calendar: - result = ::Locale::get_locale_calendar_mapping(display_names->locale(), code_string); + result = ::Locale::calendar_display_name(display_names->locale(), code_string); break; case DisplayNames::Type::DateTimeField: - switch (display_names->style()) { - case ::Locale::Style::Long: - result = ::Locale::get_locale_long_date_field_mapping(display_names->locale(), code_string); - break; - case ::Locale::Style::Short: - result = ::Locale::get_locale_short_date_field_mapping(display_names->locale(), code_string); - break; - case ::Locale::Style::Narrow: - result = ::Locale::get_locale_narrow_date_field_mapping(display_names->locale(), code_string); - break; - default: - VERIFY_NOT_REACHED(); - } + result = ::Locale::date_time_field_display_name(display_names->locale(), code_string, display_names->style()); break; default: VERIFY_NOT_REACHED(); @@ -112,8 +80,6 @@ JS_DEFINE_NATIVE_FUNCTION(DisplayNamesPrototype::of) if (result.has_value()) return PrimitiveString::create(vm, result.release_value()); - if (formatted_result.has_value()) - return PrimitiveString::create(vm, formatted_result.release_value()); // 7. If displayNames.[[Fallback]] is "code", return code. if (display_names->fallback() == DisplayNames::Fallback::Code) diff --git a/Userland/Libraries/LibJS/Runtime/Intl/Intl.cpp b/Userland/Libraries/LibJS/Runtime/Intl/Intl.cpp index 3b12bf94e3a..3102ce80f22 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/Intl.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/Intl.cpp @@ -116,7 +116,7 @@ JS_DEFINE_NATIVE_FUNCTION(Intl::supported_values_of) // 1. Let key be ? ToString(key). auto key = TRY(vm.argument(0).to_string(vm)); - ReadonlySpan list; + Optional, ReadonlySpan>> list; // 2. If key is "calendar", then if (key == "calendar"sv) { @@ -131,7 +131,8 @@ JS_DEFINE_NATIVE_FUNCTION(Intl::supported_values_of) // 4. Else if key is "currency", then else if (key == "currency"sv) { // a. Let list be ! AvailableCanonicalCurrencies( ). - list = ::Locale::get_available_currencies(); + static auto const currencies = ::Locale::available_currencies(); + list = currencies.span(); } // 5. Else if key is "numberingSystem", then else if (key == "numberingSystem"sv) { @@ -141,13 +142,13 @@ JS_DEFINE_NATIVE_FUNCTION(Intl::supported_values_of) // 6. Else if key is "timeZone", then else if (key == "timeZone"sv) { // a. Let list be ! AvailableCanonicalTimeZones( ). - static auto time_zones = available_canonical_time_zones(); + static auto const time_zones = available_canonical_time_zones(); list = time_zones.span(); } // 7. Else if key is "unit", then else if (key == "unit"sv) { // a. Let list be ! AvailableCanonicalUnits( ). - static auto units = sanctioned_single_unit_identifiers(); + static auto const units = sanctioned_single_unit_identifiers(); list = units.span(); } // 8. Else, @@ -157,8 +158,10 @@ JS_DEFINE_NATIVE_FUNCTION(Intl::supported_values_of) } // 9. Return CreateArrayFromList( list ). - return Array::create_from(realm, list, [&](auto value) { - return PrimitiveString::create(vm, value); + return list->visit([&](ReadonlySpan list) { + return Array::create_from(realm, list, [&](auto value) { + return PrimitiveString::create(vm, value); + }); }); } diff --git a/Userland/Libraries/LibJS/Runtime/Intl/LocaleConstructor.cpp b/Userland/Libraries/LibJS/Runtime/Intl/LocaleConstructor.cpp index 3f8a042d4f2..186b3270996 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/LocaleConstructor.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/LocaleConstructor.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -49,8 +49,7 @@ static ThrowCompletionOr apply_options_to_tag(VM& vm, StringView tag, Ob // 2. Assert: Type(options) is Object. // 3. If ! IsStructurallyValidLanguageTag(tag) is false, throw a RangeError exception. - auto locale_id = is_structurally_valid_language_tag(tag); - if (!locale_id.has_value()) + if (!is_structurally_valid_language_tag(tag)) return vm.throw_completion(ErrorType::IntlInvalidLanguageTag, tag); // 4. Let language be ? GetOption(options, "language", string, empty, undefined). @@ -69,10 +68,10 @@ static ThrowCompletionOr apply_options_to_tag(VM& vm, StringView tag, Ob auto region = TRY(get_string_option(vm, options, vm.names.region, ::Locale::is_unicode_region_subtag)); // 10. Set tag to ! CanonicalizeUnicodeLocaleId(tag). - auto canonicalized_tag = JS::Intl::canonicalize_unicode_locale_id(*locale_id); + auto canonicalized_tag = JS::Intl::canonicalize_unicode_locale_id(tag); // 11. Assert: tag matches the unicode_locale_id production. - locale_id = ::Locale::parse_unicode_locale_id(canonicalized_tag); + auto locale_id = ::Locale::parse_unicode_locale_id(canonicalized_tag); VERIFY(locale_id.has_value()); // 12. Let languageId be the substring of tag corresponding to the unicode_language_id production. @@ -103,8 +102,10 @@ static ThrowCompletionOr apply_options_to_tag(VM& vm, StringView tag, Ob } // 16. Set tag to tag with the substring corresponding to the unicode_language_id production replaced by the string languageId. + canonicalized_tag = locale_id->to_string(); + // 17. Return ! CanonicalizeUnicodeLocaleId(tag). - return JS::Intl::canonicalize_unicode_locale_id(*locale_id); + return JS::Intl::canonicalize_unicode_locale_id(canonicalized_tag); } // 14.1.3 ApplyUnicodeExtensionToTag ( tag, options, relevantExtensionKeys ), https://tc39.es/ecma402/#sec-apply-unicode-extension-to-tag diff --git a/Userland/Libraries/LibJS/Runtime/Intl/NumberFormat.cpp b/Userland/Libraries/LibJS/Runtime/Intl/NumberFormat.cpp index b11a53098c1..e03c9a984fb 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/NumberFormat.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/NumberFormat.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -99,13 +100,13 @@ StringView NumberFormat::resolve_currency_display() m_resolved_currency_display = currency(); break; case NumberFormat::CurrencyDisplay::Symbol: - m_resolved_currency_display = ::Locale::get_locale_short_currency_mapping(data_locale(), currency()); + m_resolved_currency_display = ::Locale::currency_display_name(data_locale(), currency(), ::Locale::Style::Short); break; case NumberFormat::CurrencyDisplay::NarrowSymbol: - m_resolved_currency_display = ::Locale::get_locale_narrow_currency_mapping(data_locale(), currency()); + m_resolved_currency_display = ::Locale::currency_display_name(data_locale(), currency(), ::Locale::Style::Narrow); break; case NumberFormat::CurrencyDisplay::Name: - m_resolved_currency_display = ::Locale::get_locale_numeric_currency_mapping(data_locale(), currency()); + m_resolved_currency_display = ::Locale::currency_numeric_display_name(data_locale(), currency()); break; default: VERIFY_NOT_REACHED(); diff --git a/Userland/Libraries/LibJS/Runtime/Intl/NumberFormat.h b/Userland/Libraries/LibJS/Runtime/Intl/NumberFormat.h index 700dcbdd78a..55dc2bc6934 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/NumberFormat.h +++ b/Userland/Libraries/LibJS/Runtime/Intl/NumberFormat.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -268,7 +268,7 @@ private: GCPtr m_bound_format; // [[BoundFormat]] // Non-standard. Stores the resolved currency display string based on [[Locale]], [[Currency]], and [[CurrencyDisplay]]. - Optional m_resolved_currency_display; + Optional m_resolved_currency_display; // Non-standard. Stores the resolved compact number format based on [[Locale]], [[Notation], [[Style]], and [[CompactDisplay]]. Optional<::Locale::NumberFormat> m_compact_format; diff --git a/Userland/Libraries/LibJS/Tests/builtins/Intl/Intl.getCanonicalLocales.js b/Userland/Libraries/LibJS/Tests/builtins/Intl/Intl.getCanonicalLocales.js index bda470aa460..1fde5a9b730 100644 --- a/Userland/Libraries/LibJS/Tests/builtins/Intl/Intl.getCanonicalLocales.js +++ b/Userland/Libraries/LibJS/Tests/builtins/Intl/Intl.getCanonicalLocales.js @@ -88,8 +88,8 @@ describe("normal behavior", () => { expect(Intl.getCanonicalLocales(["EN-US"])).toEqual(["en-US"]); expect(Intl.getCanonicalLocales(["EN-US", "Fr"])).toEqual(["en-US", "fr"]); expect(Intl.getCanonicalLocales("EN-lATN-US")).toEqual(["en-Latn-US"]); - expect(Intl.getCanonicalLocales("EN-US-POSIX")).toEqual(["en-US-posix"]); - expect(Intl.getCanonicalLocales("EN-LATN-US-POSIX")).toEqual(["en-Latn-US-posix"]); + expect(Intl.getCanonicalLocales("EN-US-POSIX")).toEqual(["en-US-u-va-posix"]); + expect(Intl.getCanonicalLocales("EN-LATN-US-POSIX")).toEqual(["en-Latn-US-u-va-posix"]); }); test("duplicate locales", () => { diff --git a/Userland/Libraries/LibLocale/CMakeLists.txt b/Userland/Libraries/LibLocale/CMakeLists.txt index 3f9df1ffcb0..a944cefdc97 100644 --- a/Userland/Libraries/LibLocale/CMakeLists.txt +++ b/Userland/Libraries/LibLocale/CMakeLists.txt @@ -11,6 +11,8 @@ endif() set(SOURCES DateTimeFormat.cpp + DisplayNames.cpp + ICU.cpp Locale.cpp NumberFormat.cpp PluralRules.cpp diff --git a/Userland/Libraries/LibLocale/DisplayNames.cpp b/Userland/Libraries/LibLocale/DisplayNames.cpp new file mode 100644 index 00000000000..a1fa1585f1f --- /dev/null +++ b/Userland/Libraries/LibLocale/DisplayNames.cpp @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2024, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#define AK_DONT_REPLACE_STD + +#include +#include +#include + +#include +#include +#include +#include + +namespace Locale { + +LanguageDisplay language_display_from_string(StringView language_display) +{ + if (language_display == "standard"sv) + return LanguageDisplay::Standard; + if (language_display == "dialect"sv) + return LanguageDisplay::Dialect; + VERIFY_NOT_REACHED(); +} + +StringView language_display_to_string(LanguageDisplay language_display) +{ + switch (language_display) { + case LanguageDisplay::Standard: + return "standard"sv; + case LanguageDisplay::Dialect: + return "dialect"sv; + default: + VERIFY_NOT_REACHED(); + } +} + +Optional language_display_name(StringView locale, StringView language, LanguageDisplay display) +{ + auto locale_data = LocaleData::for_locale(locale); + if (!locale_data.has_value()) + return {}; + + auto language_data = LocaleData::for_locale(language); + if (!language_data.has_value()) + return {}; + + auto& display_names = display == LanguageDisplay::Standard + ? locale_data->standard_display_names() + : locale_data->dialect_display_names(); + + icu::UnicodeString result; + display_names.localeDisplayName(language_data->locale().getName(), result); + + return icu_string_to_string(result); +} + +Optional region_display_name(StringView locale, StringView region) +{ + UErrorCode status = U_ZERO_ERROR; + + auto locale_data = LocaleData::for_locale(locale); + if (!locale_data.has_value()) + return {}; + + auto icu_region = icu::LocaleBuilder().setRegion(icu_string_piece(region)).build(status); + if (icu_failure(status)) + return {}; + + icu::UnicodeString result; + locale_data->standard_display_names().regionDisplayName(icu_region.getCountry(), result); + + return icu_string_to_string(result); +} + +Optional script_display_name(StringView locale, StringView script) +{ + UErrorCode status = U_ZERO_ERROR; + + auto locale_data = LocaleData::for_locale(locale); + if (!locale_data.has_value()) + return {}; + + auto icu_script = icu::LocaleBuilder().setScript(icu_string_piece(script)).build(status); + if (icu_failure(status)) + return {}; + + icu::UnicodeString result; + locale_data->standard_display_names().scriptDisplayName(icu_script.getScript(), result); + + return icu_string_to_string(result); +} + +Optional calendar_display_name(StringView locale, StringView calendar) +{ + auto locale_data = LocaleData::for_locale(locale); + if (!locale_data.has_value()) + return {}; + + if (calendar == "gregory"sv) + calendar = "gregorian"sv; + if (calendar == "islamicc"sv) + calendar = "islamic-civil"sv; + if (calendar == "ethioaa"sv) + calendar = "ethiopic-amete-alem"sv; + + icu::UnicodeString result; + locale_data->standard_display_names().keyValueDisplayName("calendar", ByteString(calendar).characters(), result); + + return icu_string_to_string(result); +} + +static constexpr UDateTimePatternField icu_date_time_field(StringView field) +{ + if (field == "day"sv) + return UDATPG_DAY_FIELD; + if (field == "dayPeriod"sv) + return UDATPG_DAYPERIOD_FIELD; + if (field == "era"sv) + return UDATPG_ERA_FIELD; + if (field == "hour"sv) + return UDATPG_HOUR_FIELD; + if (field == "minute"sv) + return UDATPG_MINUTE_FIELD; + if (field == "month"sv) + return UDATPG_MONTH_FIELD; + if (field == "quarter"sv) + return UDATPG_QUARTER_FIELD; + if (field == "second"sv) + return UDATPG_SECOND_FIELD; + if (field == "timeZoneName"sv) + return UDATPG_ZONE_FIELD; + if (field == "weekOfYear"sv) + return UDATPG_WEEK_OF_YEAR_FIELD; + if (field == "weekday"sv) + return UDATPG_WEEKDAY_FIELD; + if (field == "year"sv) + return UDATPG_YEAR_FIELD; + VERIFY_NOT_REACHED(); +} + +static constexpr UDateTimePGDisplayWidth icu_date_time_style(Style style) +{ + switch (style) { + case Style::Long: + return UDATPG_WIDE; + case Style::Short: + return UDATPG_ABBREVIATED; + case Style::Narrow: + return UDATPG_NARROW; + } + + VERIFY_NOT_REACHED(); +} + +Optional date_time_field_display_name(StringView locale, StringView field, Style style) +{ + auto locale_data = LocaleData::for_locale(locale); + if (!locale_data.has_value()) + return {}; + + auto icu_field = icu_date_time_field(field); + auto icu_style = icu_date_time_style(style); + + icu::UnicodeString result; + result = locale_data->date_time_pattern_generator().getFieldDisplayName(icu_field, icu_style); + + return icu_string_to_string(result); +} + +static constexpr Array icu_currency_code(StringView currency) +{ + VERIFY(currency.length() == 3); + + return to_array({ + static_cast(currency[0]), + static_cast(currency[1]), + static_cast(currency[2]), + u'\0', + }); +} + +static constexpr UCurrNameStyle icu_currency_style(Style style) +{ + switch (style) { + case Style::Long: + return UCURR_LONG_NAME; + case Style::Short: + return UCURR_SYMBOL_NAME; + case Style::Narrow: + return UCURR_NARROW_SYMBOL_NAME; + } + + VERIFY_NOT_REACHED(); +} + +Optional currency_display_name(StringView locale, StringView currency, Style style) +{ + UErrorCode status = U_ZERO_ERROR; + + auto locale_data = LocaleData::for_locale(locale); + if (!locale_data.has_value()) + return {}; + + auto icu_currency = icu_currency_code(currency); + + i32 length = 0; + UChar const* result = ucurr_getName(icu_currency.data(), locale_data->locale().getName(), icu_currency_style(style), nullptr, &length, &status); + + if (icu_failure(status)) + return {}; + if ((status == U_USING_DEFAULT_WARNING) && (result == icu_currency.data())) + return {}; + + return icu_string_to_string(result, length); +} + +Optional currency_numeric_display_name(StringView locale, StringView currency) +{ + UErrorCode status = U_ZERO_ERROR; + + auto locale_data = LocaleData::for_locale(locale); + if (!locale_data.has_value()) + return {}; + + auto icu_currency = icu_currency_code(currency); + + i32 length = 0; + UChar const* result = ucurr_getPluralName(icu_currency.data(), locale_data->locale().getName(), nullptr, "other", &length, &status); + + if (icu_failure(status)) + return {}; + if ((status == U_USING_DEFAULT_WARNING) && (result == icu_currency.data())) + return {}; + + return icu_string_to_string(result, length); +} + +} diff --git a/Userland/Libraries/LibLocale/DisplayNames.h b/Userland/Libraries/LibLocale/DisplayNames.h new file mode 100644 index 00000000000..30b34438622 --- /dev/null +++ b/Userland/Libraries/LibLocale/DisplayNames.h @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2024, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include +#include + +namespace Locale { + +enum class LanguageDisplay { + Standard, + Dialect, +}; + +LanguageDisplay language_display_from_string(StringView language_display); +StringView language_display_to_string(LanguageDisplay language_display); + +Optional language_display_name(StringView locale, StringView language, LanguageDisplay); +Optional region_display_name(StringView locale, StringView region); +Optional script_display_name(StringView locale, StringView script); +Optional calendar_display_name(StringView locale, StringView calendar); +Optional date_time_field_display_name(StringView locale, StringView field, Style); +Optional currency_display_name(StringView locale, StringView currency, Style); +Optional currency_numeric_display_name(StringView locale, StringView currency); + +} diff --git a/Userland/Libraries/LibLocale/Forward.h b/Userland/Libraries/LibLocale/Forward.h index b503f19feb4..dc1fa2b0854 100644 --- a/Userland/Libraries/LibLocale/Forward.h +++ b/Userland/Libraries/LibLocale/Forward.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2022, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -15,9 +15,6 @@ enum class CalendarPatternStyle : u8; enum class CalendarSymbol : u8; enum class CharacterOrder : u8; enum class CompactNumberFormatType : u8; -enum class Condition : u8; -enum class Currency : u16; -enum class DateField : u8; enum class DayPeriod : u8; enum class Era : u8; enum class FirstDayRegion : u8; @@ -30,17 +27,14 @@ enum class KeywordColCaseFirst : u8; enum class KeywordColNumeric : u8; enum class KeywordHours : u8; enum class KeywordNumbers : u8; -enum class Language : u16; enum class ListPatternType : u8; enum class Locale : u16; enum class MinimumDaysRegion : u8; enum class Month : u8; enum class NumericSymbol : u8; enum class PluralCategory : u8; -enum class ScriptTag : u8; enum class StandardNumberFormatType : u8; enum class Style : u8; -enum class Territory : u8; enum class Weekday : u8; enum class WeekendEndRegion : u8; enum class WeekendStartRegion : u8; diff --git a/Userland/Libraries/LibLocale/ICU.cpp b/Userland/Libraries/LibLocale/ICU.cpp new file mode 100644 index 00000000000..35e2dc475fa --- /dev/null +++ b/Userland/Libraries/LibLocale/ICU.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2024, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#define AK_DONT_REPLACE_STD + +#include +#include +#include +#include + +#include +#include +#include + +namespace Locale { + +static HashMap> s_locale_cache; + +Optional LocaleData::for_locale(StringView locale) +{ + auto locale_data = s_locale_cache.get(locale); + + if (!locale_data.has_value()) { + locale_data = s_locale_cache.ensure(MUST(String::from_utf8(locale)), [&]() -> OwnPtr { + UErrorCode status = U_ZERO_ERROR; + + auto icu_locale = icu::Locale::forLanguageTag(icu_string_piece(locale), status); + if (icu_failure(status)) + return nullptr; + + return adopt_own(*new LocaleData { move(icu_locale) }); + }); + } + + if (locale_data.value()) + return *locale_data.value(); + return {}; +} + +LocaleData::LocaleData(icu::Locale locale) + : m_locale(move(locale)) +{ +} + +String LocaleData::to_string() +{ + if (!m_locale_string.has_value()) { + UErrorCode status = U_ZERO_ERROR; + + auto result = locale().toLanguageTag(status); + VERIFY(icu_success(status)); + + m_locale_string = MUST(result.to_string()); + } + + return *m_locale_string; +} + +icu::LocaleDisplayNames& LocaleData::standard_display_names() +{ + if (!m_standard_display_names) + m_standard_display_names = adopt_own(*icu::LocaleDisplayNames::createInstance(locale())); + return *m_standard_display_names; +} + +icu::LocaleDisplayNames& LocaleData::dialect_display_names() +{ + if (!m_dialect_display_names) + m_dialect_display_names = adopt_own(*icu::LocaleDisplayNames::createInstance(locale(), ULDN_DIALECT_NAMES)); + return *m_dialect_display_names; +} + +icu::DateTimePatternGenerator& LocaleData::date_time_pattern_generator() +{ + if (!m_date_time_pattern_generator) { + UErrorCode status = U_ZERO_ERROR; + + m_date_time_pattern_generator = adopt_own(*icu::DateTimePatternGenerator::createInstance(locale(), status)); + VERIFY(icu_success(status)); + } + + return *m_date_time_pattern_generator; +} + +icu::StringPiece icu_string_piece(StringView string) +{ + return { string.characters_without_null_termination(), static_cast(string.length()) }; +} + +String icu_string_to_string(icu::UnicodeString const& string) +{ + return icu_string_to_string(string.getBuffer(), string.length()); +} + +String icu_string_to_string(UChar const* string, i32 length) +{ + ReadonlySpan view { reinterpret_cast(string), static_cast(length) }; + return MUST(Utf16View { view }.to_utf8()); +} + +} diff --git a/Userland/Libraries/LibLocale/ICU.h b/Userland/Libraries/LibLocale/ICU.h new file mode 100644 index 00000000000..99b998354ad --- /dev/null +++ b/Userland/Libraries/LibLocale/ICU.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#define AK_DONT_REPLACE_STD + +#include +#include +#include +#include + +#include +#include +#include +#include + +U_NAMESPACE_BEGIN +class DateTimePatternGenerator; +class LocaleDisplayNames; +class UnicodeString; +U_NAMESPACE_END + +namespace Locale { + +class LocaleData { +public: + static Optional for_locale(StringView locale); + + ALWAYS_INLINE icu::Locale& locale() { return m_locale; } + + String to_string(); + + icu::LocaleDisplayNames& standard_display_names(); + icu::LocaleDisplayNames& dialect_display_names(); + + icu::DateTimePatternGenerator& date_time_pattern_generator(); + +private: + explicit LocaleData(icu::Locale locale); + + icu::Locale m_locale; + Optional m_locale_string; + + OwnPtr m_standard_display_names; + OwnPtr m_dialect_display_names; + OwnPtr m_date_time_pattern_generator; +}; + +static constexpr bool icu_success(UErrorCode code) +{ + return static_cast(U_SUCCESS(code)); +} + +static constexpr bool icu_failure(UErrorCode code) +{ + return static_cast(U_FAILURE(code)); +} + +icu::StringPiece icu_string_piece(StringView string); +String icu_string_to_string(icu::UnicodeString const& string); +String icu_string_to_string(UChar const*, i32 length); + +} diff --git a/Userland/Libraries/LibLocale/Locale.cpp b/Userland/Libraries/LibLocale/Locale.cpp index 8ec3bd5548e..2011b7df7c9 100644 --- a/Userland/Libraries/LibLocale/Locale.cpp +++ b/Userland/Libraries/LibLocale/Locale.cpp @@ -1,17 +1,25 @@ /* - * Copyright (c) 2021-2023, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ +#define AK_DONT_REPLACE_STD + #include #include #include +#include #include #include +#include #include #include +#include +#include +#include + namespace Locale { static bool is_key(StringView key) @@ -473,266 +481,36 @@ Optional parse_unicode_locale_id(StringView locale) return locale_id; } -static void perform_hard_coded_key_value_substitutions(StringView key, String& value) +String canonicalize_unicode_locale_id(StringView locale) { - // FIXME: In the XML export of CLDR, there are some aliases defined in the following files: - // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/calendar.xml - // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/collation.xml - // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/measure.xml - // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/timezone.xml - // https://github.com/unicode-org/cldr-staging/blob/master/production/common/bcp47/transform.xml - // - // There isn't yet a counterpart in the JSON export. See: https://unicode-org.atlassian.net/browse/CLDR-14571 - Optional result; + UErrorCode status = U_ZERO_ERROR; - if (key == "ca"sv) { - if (value == "islamicc"sv) - result = "islamic-civil"sv; - else if (value == "ethiopic-amete-alem"sv) - result = "ethioaa"sv; - } else if (key.is_one_of("kb"sv, "kc"sv, "kh"sv, "kk"sv, "kn"sv) && (value == "yes"sv)) { - result = "true"sv; - } else if (key == "ks"sv) { - if (value == "primary"sv) - result = "level1"sv; - else if (value == "tertiary"sv) - result = "level3"sv; - // Note: There are also aliases for "secondary", "quaternary", "quarternary", and "identical", - // but those are semantically incorrect values (they are too long), so they can be skipped. - } else if ((key == "m0"sv) && (value == "names"sv)) { - result = "prprname"sv; - } else if ((key == "ms"sv) && (value == "imperial"sv)) { - result = "uksystem"sv; - } else if (key == "tz"sv) { - // Formatter disabled because this block is easier to read / check against timezone.xml as one-liners. - // clang-format off - if (value == "aqams"sv) result = "nzakl"sv; - else if (value == "cnckg"sv) result = "cnsha"sv; - else if (value == "cnhrb"sv) result = "cnsha"sv; - else if (value == "cnkhg"sv) result = "cnurc"sv; - else if (value == "cuba"sv) result = "cuhav"sv; - else if (value == "egypt"sv) result = "egcai"sv; - else if (value == "eire"sv) result = "iedub"sv; - else if (value == "est"sv) result = "utcw05"sv; - else if (value == "gmt0"sv) result = "gmt"sv; - else if (value == "hongkong"sv) result = "hkhkg"sv; - else if (value == "hst"sv) result = "utcw10"sv; - else if (value == "iceland"sv) result = "isrey"sv; - else if (value == "iran"sv) result = "irthr"sv; - else if (value == "israel"sv) result = "jeruslm"sv; - else if (value == "jamaica"sv) result = "jmkin"sv; - else if (value == "japan"sv) result = "jptyo"sv; - else if (value == "kwajalein"sv) result = "mhkwa"sv; - else if (value == "libya"sv) result = "lytip"sv; - else if (value == "mst"sv) result = "utcw07"sv; - else if (value == "navajo"sv) result = "usden"sv; - else if (value == "poland"sv) result = "plwaw"sv; - else if (value == "portugal"sv) result = "ptlis"sv; - else if (value == "prc"sv) result = "cnsha"sv; - else if (value == "roc"sv) result = "twtpe"sv; - else if (value == "rok"sv) result = "krsel"sv; - else if (value == "singapore"sv) result = "sgsin"sv; - else if (value == "turkey"sv) result = "trist"sv; - else if (value == "uct"sv) result = "utc"sv; - else if (value == "usnavajo"sv) result = "usden"sv; - else if (value == "zulu"sv) result = "utc"sv; - // clang-format on - } + auto locale_data = LocaleData::for_locale(locale); + VERIFY(locale_data.has_value()); - if (result.has_value()) - value = MUST(String::from_utf8(*result)); + locale_data->locale().canonicalize(status); + VERIFY(icu_success(status)); + + return locale_data->to_string(); } -void canonicalize_unicode_extension_values(StringView key, String& value, bool remove_true) +void canonicalize_unicode_extension_values(StringView key, String& value, bool) { - value = MUST(value.to_lowercase()); - perform_hard_coded_key_value_substitutions(key, value); + UErrorCode status = U_ZERO_ERROR; - // Note: The spec says to remove "true" type and tfield values but that is believed to be a bug in the spec - // because, for tvalues, that would result in invalid syntax: - // https://unicode-org.atlassian.net/browse/CLDR-14318 - // This has also been noted by test262: - // https://github.com/tc39/test262/blob/18bb955771669541c56c28748603f6afdb2e25ff/test/intl402/Intl/getCanonicalLocales/transformed-ext-canonical.js - if (remove_true && (value == "true"sv)) { - value = {}; - return; - } + icu::LocaleBuilder builder; + builder.setUnicodeLocaleKeyword(icu_string_piece(key), icu_string_piece(value)); - if (key.is_one_of("sd"sv, "rg"sv)) { - if (auto alias = resolve_subdivision_alias(value); alias.has_value()) { - auto aliases = alias->split_view(' '); + auto locale = builder.build(status); + VERIFY(icu_success(status)); - // FIXME: Subdivision subtags do not appear in the CLDR likelySubtags.json file. - // Implement the spec's recommendation of using just the first alias for now, - // but we should determine if there's anything else needed here. - value = MUST(String::from_utf8(aliases[0])); - } - } -} + locale.canonicalize(status); + VERIFY(icu_success(status)); -static void transform_unicode_locale_id_to_canonical_syntax(LocaleID& locale_id) -{ - auto canonicalize_language = [&](LanguageID& language_id, bool force_lowercase) { - language_id.language = MUST(language_id.language->to_lowercase()); - if (language_id.script.has_value()) - language_id.script = MUST(language_id.script->to_titlecase()); - if (language_id.region.has_value()) - language_id.region = MUST(language_id.region->to_uppercase()); - for (auto& variant : language_id.variants) - variant = MUST(variant.to_lowercase()); + auto result = locale.getUnicodeKeywordValue(icu_string_piece(key), status); + VERIFY(icu_success(status)); - resolve_complex_language_aliases(language_id); - - if (auto alias = resolve_language_alias(*language_id.language); alias.has_value()) { - auto language_alias = parse_unicode_language_id(*alias); - VERIFY(language_alias.has_value()); - - language_id.language = move(language_alias->language); - if (!language_id.script.has_value() && language_alias->script.has_value()) - language_id.script = move(language_alias->script); - if (!language_id.region.has_value() && language_alias->region.has_value()) - language_id.region = move(language_alias->region); - if (language_id.variants.is_empty() && !language_alias->variants.is_empty()) - language_id.variants = move(language_alias->variants); - } - - if (language_id.script.has_value()) { - if (auto alias = resolve_script_tag_alias(*language_id.script); alias.has_value()) - language_id.script = MUST(String::from_utf8(*alias)); - } - - if (language_id.region.has_value()) { - if (auto alias = resolve_territory_alias(*language_id.region); alias.has_value()) - language_id.region = resolve_most_likely_territory_alias(language_id, *alias); - } - - quick_sort(language_id.variants); - - for (auto& variant : language_id.variants) { - variant = MUST(variant.to_lowercase()); - if (auto alias = resolve_variant_alias(variant); alias.has_value()) - variant = MUST(String::from_utf8(*alias)); - } - - if (force_lowercase) { - if (language_id.script.has_value()) - language_id.script = MUST(language_id.script->to_lowercase()); - if (language_id.region.has_value()) - language_id.region = MUST(language_id.region->to_lowercase()); - } - }; - - canonicalize_language(locale_id.language_id, false); - - quick_sort(locale_id.extensions, [](auto const& left, auto const& right) { - auto key = [](auto const& extension) { - return extension.visit( - [](LocaleExtension const&) { return 'u'; }, - [](TransformedExtension const&) { return 't'; }, - [](OtherExtension const& ext) { return static_cast(to_ascii_lowercase(ext.key)); }); - }; - - return key(left) < key(right); - }); - - for (auto& extension : locale_id.extensions) { - extension.visit( - [&](LocaleExtension& ext) { - for (auto& attribute : ext.attributes) - attribute = MUST(attribute.to_lowercase()); - - for (auto& keyword : ext.keywords) { - keyword.key = MUST(keyword.key.to_lowercase()); - canonicalize_unicode_extension_values(keyword.key, keyword.value, true); - } - - quick_sort(ext.attributes); - quick_sort(ext.keywords, [](auto const& a, auto const& b) { return a.key < b.key; }); - }, - [&](TransformedExtension& ext) { - if (ext.language.has_value()) - canonicalize_language(*ext.language, true); - - for (auto& field : ext.fields) { - field.key = MUST(field.key.to_lowercase()); - canonicalize_unicode_extension_values(field.key, field.value, false); - } - - quick_sort(ext.fields, [](auto const& a, auto const& b) { return a.key < b.key; }); - }, - [&](OtherExtension& ext) { - ext.key = static_cast(to_ascii_lowercase(ext.key)); - ext.value = MUST(ext.value.to_lowercase()); - }); - } - - for (auto& extension : locale_id.private_use_extensions) - extension = MUST(extension.to_lowercase()); -} - -Optional canonicalize_unicode_locale_id(LocaleID& locale_id) -{ - // https://unicode.org/reports/tr35/#Canonical_Unicode_Locale_Identifiers - StringBuilder builder; - - auto append_sep_and_string = [&](Optional const& string) { - if (!string.has_value() || string->is_empty()) - return; - builder.appendff("-{}", *string); - }; - - if (!locale_id.language_id.language.has_value()) - return {}; - - transform_unicode_locale_id_to_canonical_syntax(locale_id); - - builder.append(MUST(locale_id.language_id.language->to_lowercase())); - append_sep_and_string(locale_id.language_id.script); - append_sep_and_string(locale_id.language_id.region); - for (auto const& variant : locale_id.language_id.variants) - append_sep_and_string(variant); - - for (auto const& extension : locale_id.extensions) { - extension.visit( - [&](LocaleExtension const& ext) { - builder.append("-u"sv); - - for (auto const& attribute : ext.attributes) - append_sep_and_string(attribute); - for (auto const& keyword : ext.keywords) { - append_sep_and_string(keyword.key); - append_sep_and_string(keyword.value); - } - }, - [&](TransformedExtension const& ext) { - builder.append("-t"sv); - - if (ext.language.has_value()) { - append_sep_and_string(ext.language->language); - append_sep_and_string(ext.language->script); - append_sep_and_string(ext.language->region); - for (auto const& variant : ext.language->variants) - append_sep_and_string(variant); - } - - for (auto const& field : ext.fields) { - append_sep_and_string(field.key); - append_sep_and_string(field.value); - } - }, - [&](OtherExtension const& ext) { - builder.appendff("-{:c}", to_ascii_lowercase(ext.key)); - append_sep_and_string(ext.value); - }); - } - - if (!locale_id.private_use_extensions.is_empty()) { - builder.append("-x"sv); - for (auto const& extension : locale_id.private_use_extensions) - append_sep_and_string(extension); - } - - return MUST(builder.to_string()); + value = MUST(result.to_string()); } StringView default_locale() @@ -775,15 +553,9 @@ ReadonlySpan __attribute__((weak)) get_available_calendars() { retur ReadonlySpan __attribute__((weak)) get_available_collation_case_orderings() { return {}; } ReadonlySpan __attribute__((weak)) get_available_collation_numeric_orderings() { return {}; } ReadonlySpan __attribute__((weak)) get_available_collation_types() { return {}; } -ReadonlySpan __attribute__((weak)) get_available_currencies() { return {}; } ReadonlySpan __attribute__((weak)) get_available_hour_cycles() { return {}; } ReadonlySpan __attribute__((weak)) get_available_number_systems() { return {}; } Optional __attribute__((weak)) locale_from_string(StringView) { return {}; } -Optional __attribute__((weak)) language_from_string(StringView) { return {}; } -Optional __attribute__((weak)) territory_from_string(StringView) { return {}; } -Optional __attribute__((weak)) script_tag_from_string(StringView) { return {}; } -Optional __attribute__((weak)) currency_from_string(StringView) { return {}; } -Optional __attribute__((weak)) date_field_from_string(StringView) { return {}; } Optional __attribute__((weak)) list_pattern_type_from_string(StringView) { return {}; } Optional __attribute__((weak)) key_from_string(StringView) { return {}; } Optional __attribute__((weak)) keyword_ca_from_string(StringView) { return {}; } @@ -794,57 +566,33 @@ Optional __attribute__((weak)) keyword_kn_from_string(StringV Optional __attribute__((weak)) keyword_nu_from_string(StringView) { return {}; } Vector __attribute__((weak)) get_keywords_for_locale(StringView, StringView) { return {}; } Optional __attribute__((weak)) get_preferred_keyword_value_for_locale(StringView, StringView) { return {}; } -Optional __attribute__((weak)) get_locale_display_patterns(StringView) { return {}; } -Optional __attribute__((weak)) get_locale_language_mapping(StringView, StringView) { return {}; } -Optional __attribute__((weak)) get_locale_territory_mapping(StringView, StringView) { return {}; } -Optional __attribute__((weak)) get_locale_script_mapping(StringView, StringView) { return {}; } -Optional __attribute__((weak)) get_locale_long_currency_mapping(StringView, StringView) { return {}; } -Optional __attribute__((weak)) get_locale_short_currency_mapping(StringView, StringView) { return {}; } -Optional __attribute__((weak)) get_locale_narrow_currency_mapping(StringView, StringView) { return {}; } -Optional __attribute__((weak)) get_locale_numeric_currency_mapping(StringView, StringView) { return {}; } -Optional __attribute__((weak)) get_locale_calendar_mapping(StringView, StringView) { return {}; } -Optional __attribute__((weak)) get_locale_long_date_field_mapping(StringView, StringView) { return {}; } -Optional __attribute__((weak)) get_locale_short_date_field_mapping(StringView, StringView) { return {}; } -Optional __attribute__((weak)) get_locale_narrow_date_field_mapping(StringView, StringView) { return {}; } -// https://www.unicode.org/reports/tr35/tr35-39/tr35-general.html#Display_Name_Elements -Optional format_locale_for_display(StringView locale, LocaleID locale_id) +Vector available_currencies() { - auto language_id = move(locale_id.language_id); - VERIFY(language_id.language.has_value()); + UErrorCode status = U_ZERO_ERROR; - auto patterns = get_locale_display_patterns(locale); - if (!patterns.has_value()) + auto* currencies = ucurr_openISOCurrencies(UCURR_ALL, &status); + ScopeGuard guard { [&]() { uenum_close(currencies); } }; + + if (icu_failure(status)) return {}; - auto primary_tag = get_locale_language_mapping(locale, *language_id.language).value_or(*language_id.language); - Optional script; - Optional region; + Vector result; - if (language_id.script.has_value()) - script = get_locale_script_mapping(locale, *language_id.script).value_or(*language_id.script); - if (language_id.region.has_value()) - region = get_locale_territory_mapping(locale, *language_id.region).value_or(*language_id.region); + while (true) { + i32 length = 0; + char const* next = uenum_next(currencies, &length, &status); - Optional secondary_tag; + if (icu_failure(status)) + return {}; + if (next == nullptr) + break; - if (script.has_value() && region.has_value()) { - secondary_tag = MUST(String::from_utf8(patterns->locale_separator)); - secondary_tag = MUST(secondary_tag->replace("{0}"sv, *script, ReplaceMode::FirstOnly)); - secondary_tag = MUST(secondary_tag->replace("{1}"sv, *region, ReplaceMode::FirstOnly)); - } else if (script.has_value()) { - secondary_tag = MUST(String::from_utf8(*script)); - } else if (region.has_value()) { - secondary_tag = MUST(String::from_utf8(*region)); + // https://unicode-org.atlassian.net/browse/ICU-21687 + if (StringView currency { next, static_cast(length) }; currency != "LSM"sv) + result.append(MUST(String::from_utf8(currency))); } - if (!secondary_tag.has_value()) - return MUST(String::from_utf8(primary_tag)); - - auto result = MUST(String::from_utf8(patterns->locale_pattern)); - result = MUST(result.replace("{0}"sv, primary_tag, ReplaceMode::FirstOnly)); - result = MUST(result.replace("{1}"sv, *secondary_tag, ReplaceMode::FirstOnly)); - return result; } @@ -852,12 +600,6 @@ Optional __attribute__((weak)) get_locale_list_patterns(StringView Optional __attribute__((weak)) character_order_from_string(StringView) { return {}; } StringView __attribute__((weak)) character_order_to_string(CharacterOrder) { return {}; } Optional __attribute__((weak)) character_order_for_locale(StringView) { return {}; } -Optional __attribute__((weak)) resolve_language_alias(StringView) { return {}; } -Optional __attribute__((weak)) resolve_territory_alias(StringView) { return {}; } -Optional __attribute__((weak)) resolve_script_tag_alias(StringView) { return {}; } -Optional __attribute__((weak)) resolve_variant_alias(StringView) { return {}; } -Optional __attribute__((weak)) resolve_subdivision_alias(StringView) { return {}; } -void __attribute__((weak)) resolve_complex_language_aliases(LanguageID&) { } Optional __attribute__((weak)) add_likely_subtags(LanguageID const&) { return {}; } Optional remove_likely_subtags(LanguageID const& language_id) @@ -902,21 +644,6 @@ Optional remove_likely_subtags(LanguageID const& language_id) return return_language_and_variants(maximized.release_value(), move(variants)); } -Optional __attribute__((weak)) resolve_most_likely_territory(LanguageID const&) { return {}; } - -String resolve_most_likely_territory_alias(LanguageID const& language_id, StringView territory_alias) -{ - auto aliases = territory_alias.split_view(' '); - - if (aliases.size() > 1) { - auto territory = resolve_most_likely_territory(language_id); - if (territory.has_value() && aliases.contains_slow(*territory)) - return territory.release_value(); - } - - return MUST(String::from_utf8(aliases[0])); -} - String LanguageID::to_string() const { StringBuilder builder; diff --git a/Userland/Libraries/LibLocale/Locale.h b/Userland/Libraries/LibLocale/Locale.h index ef0b445e98e..bc2ff37d736 100644 --- a/Userland/Libraries/LibLocale/Locale.h +++ b/Userland/Libraries/LibLocale/Locale.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, Tim Flynn + * Copyright (c) 2021-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -139,8 +139,8 @@ bool is_type_identifier(StringView); Optional parse_unicode_language_id(StringView); Optional parse_unicode_locale_id(StringView); +String canonicalize_unicode_locale_id(StringView); void canonicalize_unicode_extension_values(StringView key, String& value, bool remove_true); -Optional canonicalize_unicode_locale_id(LocaleID&); StringView default_locale(); bool is_locale_available(StringView locale); @@ -150,19 +150,15 @@ ReadonlySpan get_available_calendars(); ReadonlySpan get_available_collation_case_orderings(); ReadonlySpan get_available_collation_numeric_orderings(); ReadonlySpan get_available_collation_types(); -ReadonlySpan get_available_currencies(); ReadonlySpan get_available_hour_cycles(); ReadonlySpan get_available_number_systems(); +Vector available_currencies(); + Style style_from_string(StringView style); StringView style_to_string(Style style); Optional locale_from_string(StringView locale); -Optional language_from_string(StringView language); -Optional territory_from_string(StringView territory); -Optional script_tag_from_string(StringView script_tag); -Optional currency_from_string(StringView currency); -Optional date_field_from_string(StringView calendar); Optional list_pattern_type_from_string(StringView list_pattern_type); Optional key_from_string(StringView key); @@ -175,38 +171,13 @@ Optional keyword_nu_from_string(StringView nu); Vector get_keywords_for_locale(StringView locale, StringView key); Optional get_preferred_keyword_value_for_locale(StringView locale, StringView key); -Optional get_locale_display_patterns(StringView locale); -Optional format_locale_for_display(StringView locale, LocaleID locale_id); - -Optional get_locale_language_mapping(StringView locale, StringView language); -Optional get_locale_territory_mapping(StringView locale, StringView territory); -Optional get_locale_script_mapping(StringView locale, StringView script); -Optional get_locale_long_currency_mapping(StringView locale, StringView currency); -Optional get_locale_short_currency_mapping(StringView locale, StringView currency); -Optional get_locale_narrow_currency_mapping(StringView locale, StringView currency); -Optional get_locale_numeric_currency_mapping(StringView locale, StringView currency); -Optional get_locale_calendar_mapping(StringView locale, StringView calendar); -Optional get_locale_long_date_field_mapping(StringView locale, StringView date_field); -Optional get_locale_short_date_field_mapping(StringView locale, StringView date_field); -Optional get_locale_narrow_date_field_mapping(StringView locale, StringView date_field); - Optional get_locale_list_patterns(StringView locale, StringView type, Style style); Optional character_order_from_string(StringView character_order); StringView character_order_to_string(CharacterOrder character_order); Optional character_order_for_locale(StringView locale); -Optional resolve_language_alias(StringView language); -Optional resolve_territory_alias(StringView territory); -Optional resolve_script_tag_alias(StringView script_tag); -Optional resolve_variant_alias(StringView variant); -Optional resolve_subdivision_alias(StringView subdivision); -void resolve_complex_language_aliases(LanguageID& language_id); - Optional add_likely_subtags(LanguageID const& language_id); Optional remove_likely_subtags(LanguageID const& language_id); -Optional resolve_most_likely_territory(LanguageID const& language_id); -String resolve_most_likely_territory_alias(LanguageID const& language_id, StringView territory_alias); - }