diff --git a/Userland/Libraries/LibJS/Runtime/Intl/Collator.cpp b/Userland/Libraries/LibJS/Runtime/Intl/Collator.cpp index 175e629b5b6..eb7debf414f 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/Collator.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/Collator.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Tim Flynn + * Copyright (c) 2022-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -16,83 +16,6 @@ Collator::Collator(Object& prototype) { } -void Collator::set_usage(StringView type) -{ - if (type == "sort"sv) - m_usage = Usage::Sort; - else if (type == "search"sv) - m_usage = Usage::Search; - else - VERIFY_NOT_REACHED(); -} - -StringView Collator::usage_string() const -{ - switch (m_usage) { - case Usage::Sort: - return "sort"sv; - case Usage::Search: - return "search"sv; - default: - VERIFY_NOT_REACHED(); - } -} - -void Collator::set_sensitivity(StringView type) -{ - if (type == "base"sv) - m_sensitivity = Sensitivity::Base; - else if (type == "accent"sv) - m_sensitivity = Sensitivity::Accent; - else if (type == "case"sv) - m_sensitivity = Sensitivity::Case; - else if (type == "variant"sv) - m_sensitivity = Sensitivity::Variant; - else - VERIFY_NOT_REACHED(); -} - -StringView Collator::sensitivity_string() const -{ - switch (m_sensitivity) { - case Sensitivity::Base: - return "base"sv; - case Sensitivity::Accent: - return "accent"sv; - case Sensitivity::Case: - return "case"sv; - case Sensitivity::Variant: - return "variant"sv; - default: - VERIFY_NOT_REACHED(); - } -} - -void Collator::set_case_first(StringView case_first) -{ - if (case_first == "upper"sv) - m_case_first = CaseFirst::Upper; - else if (case_first == "lower"sv) - m_case_first = CaseFirst::Lower; - else if (case_first == "false"sv) - m_case_first = CaseFirst::False; - else - VERIFY_NOT_REACHED(); -} - -StringView Collator::case_first_string() const -{ - switch (m_case_first) { - case CaseFirst::Upper: - return "upper"sv; - case CaseFirst::Lower: - return "lower"sv; - case CaseFirst::False: - return "false"sv; - default: - VERIFY_NOT_REACHED(); - } -} void Collator::visit_edges(Visitor& visitor) { Base::visit_edges(visitor); diff --git a/Userland/Libraries/LibJS/Runtime/Intl/Collator.h b/Userland/Libraries/LibJS/Runtime/Intl/Collator.h index 207f8830540..f35c9e44da7 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/Collator.h +++ b/Userland/Libraries/LibJS/Runtime/Intl/Collator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, Tim Flynn + * Copyright (c) 2022-2024, Tim Flynn * * SPDX-License-Identifier: BSD-2-Clause */ @@ -11,6 +11,7 @@ #include #include #include +#include namespace JS::Intl { @@ -19,24 +20,6 @@ class Collator final : public Object { JS_DECLARE_ALLOCATOR(Collator); public: - enum class Usage { - Sort, - Search, - }; - - enum class Sensitivity { - Base, - Accent, - Case, - Variant, - }; - - enum class CaseFirst { - Upper, - Lower, - False, - }; - static constexpr auto relevant_extension_keys() { // 10.2.3 Internal slots, https://tc39.es/ecma402/#sec-intl-collator-internal-slots @@ -49,17 +32,17 @@ public: String const& locale() const { return m_locale; } void set_locale(String locale) { m_locale = move(locale); } - Usage usage() const { return m_usage; } - void set_usage(StringView usage); - StringView usage_string() const; + Unicode::Usage usage() const { return m_usage; } + void set_usage(StringView usage) { m_usage = Unicode::usage_from_string(usage); } + StringView usage_string() const { return Unicode::usage_to_string(m_usage); } - Sensitivity sensitivity() const { return m_sensitivity; } - void set_sensitivity(StringView sensitivity); - StringView sensitivity_string() const; + Unicode::Sensitivity sensitivity() const { return m_sensitivity; } + void set_sensitivity(StringView sensitivity) { m_sensitivity = Unicode::sensitivity_from_string(sensitivity); } + StringView sensitivity_string() const { return Unicode::sensitivity_to_string(m_sensitivity); } - CaseFirst case_first() const { return m_case_first; } - void set_case_first(StringView case_first); - StringView case_first_string() const; + Unicode::CaseFirst case_first() const { return m_case_first; } + void set_case_first(StringView case_first) { m_case_first = Unicode::case_first_from_string(case_first); } + StringView case_first_string() const { return Unicode::case_first_to_string(m_case_first); } String const& collation() const { return m_collation; } void set_collation(String collation) { m_collation = move(collation); } @@ -73,19 +56,25 @@ public: CollatorCompareFunction* bound_compare() const { return m_bound_compare; } void set_bound_compare(CollatorCompareFunction* bound_compare) { m_bound_compare = bound_compare; } + Unicode::Collator const& collator() const { return *m_collator; } + void set_collator(NonnullOwnPtr collator) { m_collator = move(collator); } + private: explicit Collator(Object& prototype); virtual void visit_edges(Visitor&) override; - String m_locale; // [[Locale]] - Usage m_usage { Usage::Sort }; // [[Usage]] - Sensitivity m_sensitivity { Sensitivity::Variant }; // [[Sensitivity]] - CaseFirst m_case_first { CaseFirst::False }; // [[CaseFirst]] - String m_collation; // [[Collation]] - bool m_ignore_punctuation { false }; // [[IgnorePunctuation]] - bool m_numeric { false }; // [[Numeric]] - GCPtr m_bound_compare; // [[BoundCompare]] + String m_locale; // [[Locale]] + Unicode::Usage m_usage { Unicode::Usage::Sort }; // [[Usage]] + Unicode::Sensitivity m_sensitivity { Unicode::Sensitivity::Variant }; // [[Sensitivity]] + Unicode::CaseFirst m_case_first { Unicode::CaseFirst::False }; // [[CaseFirst]] + String m_collation; // [[Collation]] + bool m_ignore_punctuation { false }; // [[IgnorePunctuation]] + bool m_numeric { false }; // [[Numeric]] + GCPtr m_bound_compare; // [[BoundCompare]] + + // Non-standard. Stores the ICU collator for the Intl object's collation options. + OwnPtr m_collator; }; } diff --git a/Userland/Libraries/LibJS/Runtime/Intl/CollatorCompareFunction.cpp b/Userland/Libraries/LibJS/Runtime/Intl/CollatorCompareFunction.cpp index faae88b4b6f..7246cb3601f 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/CollatorCompareFunction.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/CollatorCompareFunction.cpp @@ -31,22 +31,10 @@ void CollatorCompareFunction::initialize(Realm&) define_direct_property(vm.names.name, PrimitiveString::create(vm, String {}), Attribute::Configurable); } -// 10.3.3.2 CompareStrings ( collator, x, y ), https://tc39.es/ecma402/#sec-collator-comparestrings -double compare_strings(Collator& collator, Utf8View const& x, Utf8View const& y) +void CollatorCompareFunction::visit_edges(Visitor& visitor) { - // FIXME: Implement https://unicode.org/reports/tr10 - (void)collator; - auto x_iterator = x.begin(); - auto y_iterator = y.begin(); - for (; x_iterator != x.end() && y_iterator != y.end(); ++x_iterator, ++y_iterator) { - if (*x_iterator != *y_iterator) - return static_cast(*x_iterator) - static_cast(*y_iterator); - } - if (x_iterator != x.end()) - return 1.0; - if (y_iterator != y.end()) - return -1.0; - return 0.0; + Base::visit_edges(visitor); + visitor.visit(m_collator); } // 10.3.3.1 Collator Compare Functions, https://tc39.es/ecma402/#sec-collator-compare-functions @@ -61,17 +49,32 @@ ThrowCompletionOr CollatorCompareFunction::call() // 5. Let X be ? ToString(x). auto x = TRY(vm.argument(0).to_string(vm)); + // 6. Let Y be ? ToString(y). auto y = TRY(vm.argument(1).to_string(vm)); // 7. Return CompareStrings(collator, X, Y). - return compare_strings(m_collator, x.code_points(), y.code_points()); + return compare_strings(m_collator, x, y); } -void CollatorCompareFunction::visit_edges(Visitor& visitor) +// 10.3.3.2 CompareStrings ( collator, x, y ), https://tc39.es/ecma402/#sec-collator-comparestrings +int compare_strings(Collator const& collator, StringView x, StringView y) { - Base::visit_edges(visitor); - visitor.visit(m_collator); + auto result = collator.collator().compare(x, y); + + // The result is intended to correspond with a sort order of String values according to the effective locale and + // collation options of collator, and will be negative when x is ordered before y, positive when x is ordered after + // y, and zero in all other cases (representing no relative ordering between x and y). + switch (result) { + case Unicode::Collator::Order::Before: + return -1; + case Unicode::Collator::Order::Equal: + return 0; + case Unicode::Collator::Order::After: + return 1; + } + + VERIFY_NOT_REACHED(); } } diff --git a/Userland/Libraries/LibJS/Runtime/Intl/CollatorCompareFunction.h b/Userland/Libraries/LibJS/Runtime/Intl/CollatorCompareFunction.h index 89975842282..e6c3d5417cf 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/CollatorCompareFunction.h +++ b/Userland/Libraries/LibJS/Runtime/Intl/CollatorCompareFunction.h @@ -30,6 +30,6 @@ private: NonnullGCPtr m_collator; // [[Collator]] }; -double compare_strings(Collator&, Utf8View const& x, Utf8View const& y); +int compare_strings(Collator const&, StringView x, StringView y); } diff --git a/Userland/Libraries/LibJS/Runtime/Intl/CollatorConstructor.cpp b/Userland/Libraries/LibJS/Runtime/Intl/CollatorConstructor.cpp index 0557d596ab6..a20904cb789 100644 --- a/Userland/Libraries/LibJS/Runtime/Intl/CollatorConstructor.cpp +++ b/Userland/Libraries/LibJS/Runtime/Intl/CollatorConstructor.cpp @@ -114,7 +114,7 @@ static ThrowCompletionOr> initialize_collator(VM& vm, Col // 27. If sensitivity is undefined, then if (sensitivity.is_undefined()) { // a. If usage is "sort", then - if (collator.usage() == Collator::Usage::Sort) { + if (collator.usage() == Unicode::Usage::Sort) { // i. Let sensitivity be "variant". sensitivity = PrimitiveString::create(vm, "variant"_string); } @@ -136,6 +136,17 @@ static ThrowCompletionOr> initialize_collator(VM& vm, Col // 30. Set collator.[[IgnorePunctuation]] to ignorePunctuation. collator.set_ignore_punctuation(ignore_punctuation.as_bool()); + // Non-standard, create an ICU collator for this Intl object. + auto icu_collator = Unicode::Collator::create( + collator.locale(), + collator.usage(), + collator.collation(), + collator.sensitivity(), + collator.case_first(), + collator.numeric(), + collator.ignore_punctuation()); + collator.set_collator(move(icu_collator)); + // 31. Return collator. return collator; } diff --git a/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp b/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp index d3cb804eafb..94f64ec76df 100644 --- a/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp +++ b/Userland/Libraries/LibJS/Runtime/StringPrototype.cpp @@ -566,7 +566,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::locale_compare) auto collator = TRY(construct(vm, realm.intrinsics().intl_collator_constructor(), vm.argument(1), vm.argument(2))); // 5. Return CompareStrings(collator, S, thatValue). - return Intl::compare_strings(static_cast(*collator), string.code_points(), that_value.code_points()); + return Intl::compare_strings(static_cast(*collator), string, that_value); } // 22.1.3.13 String.prototype.match ( regexp ), https://tc39.es/ecma262/#sec-string.prototype.match diff --git a/Userland/Libraries/LibJS/Tests/builtins/Intl/Collator/Collator.prototype.compare.js b/Userland/Libraries/LibJS/Tests/builtins/Intl/Collator/Collator.prototype.compare.js index ec27f2a2fb6..fdfc10c9ff6 100644 --- a/Userland/Libraries/LibJS/Tests/builtins/Intl/Collator/Collator.prototype.compare.js +++ b/Userland/Libraries/LibJS/Tests/builtins/Intl/Collator/Collator.prototype.compare.js @@ -17,13 +17,13 @@ describe("correct behavior", () => { const aTob = collator.compare(a, b); const bToa = collator.compare(b, a); - expect(aTob > 0).toBeTrue(); - expect(aTob).toBe(-bToa); + expect(aTob).toBe(1); + expect(bToa).toBe(-1); } compareBoth("a", ""); compareBoth("1", ""); - compareBoth("a", "A"); + compareBoth("A", "a"); compareBoth("7", "3"); compareBoth("0000", "0"); @@ -31,8 +31,65 @@ describe("correct behavior", () => { expect(collator.compare("undefined", undefined)).toBe(0); expect(collator.compare("null", null)).toBe(0); - expect(collator.compare("null", undefined)).not.toBe(0); - expect(collator.compare("null") < 0).toBeTrue(); + expect(collator.compare("null", undefined)).toBe(-1); + expect(collator.compare("null")).toBe(-1); + }); + + test("canonically equivalent strings", () => { + var tests = [ + ["ä\u0306", "a\u0308\u0306"], + ["ă\u0308", "a\u0306\u0308"], + ["ạ\u0308", "a\u0323\u0308"], + ["a\u0308\u0323", "a\u0323\u0308"], + ["ä\u0323", "a\u0323\u0308"], + ["Å", "Å"], + ["Å", "A\u030A"], + ["Ç", "C\u0327"], + ["ḋ\u0323", "ḍ\u0307"], + ["ḋ\u0323", "d\u0323\u0307"], + ["ô", "o\u0302"], + ["ö", "o\u0308"], + ["q\u0307\u0323", "q\u0323\u0307"], + ["ṩ", "s\u0323\u0307"], + ["ự", "ụ\u031B"], + ["ự", "u\u031B\u0323"], + ["ự", "ư\u0323"], + ["ự", "u\u0323\u031B"], + ["Ω", "Ω"], + ["x\u031B\u0323", "x\u0323\u031B"], + ["퓛", "\u1111\u1171\u11B6"], + ["北", "\uD87E\uDC2B"], + ["가", "\u1100\u1161"], + ["\uD834\uDD5E", "\uD834\uDD57\uD834\uDD65"], + ]; + + const en = new Intl.Collator("en"); + const ja = new Intl.Collator("ja"); + const th = new Intl.Collator("th"); + + tests.forEach(test => { + expect(en.compare(test[0], test[1])).toBe(0); + expect(ja.compare(test[0], test[1])).toBe(0); + expect(th.compare(test[0], test[1])).toBe(0); + }); + }); + + test("ignorePunctuation", () => { + [undefined, true, false].forEach(ignorePunctuation => { + let expected = false; + + const en = new Intl.Collator("en", { ignorePunctuation }); + expect(en.compare("", " ")).toBe(en.resolvedOptions().ignorePunctuation ? 0 : -1); + expect(en.compare("", ",")).toBe(en.resolvedOptions().ignorePunctuation ? 0 : -1); + + const ja = new Intl.Collator("ja", { ignorePunctuation }); + expect(ja.compare("", " ")).toBe(ja.resolvedOptions().ignorePunctuation ? 0 : -1); + expect(ja.compare("", ",")).toBe(ja.resolvedOptions().ignorePunctuation ? 0 : -1); + + const th = new Intl.Collator("th", { ignorePunctuation }); + expect(th.compare("", " ")).toBe(th.resolvedOptions().ignorePunctuation ? 0 : -1); + expect(th.compare("", ",")).toBe(th.resolvedOptions().ignorePunctuation ? 0 : -1); + }); }); test("UTF-16", () => { diff --git a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.localeCompare.js b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.localeCompare.js index 5d6bebc2d8e..36076eefde8 100644 --- a/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.localeCompare.js +++ b/Userland/Libraries/LibJS/Tests/builtins/String/String.prototype.localeCompare.js @@ -15,7 +15,7 @@ test("basic functionality", () => { compareBoth("a", ""); compareBoth("1", ""); - compareBoth("a", "A"); + compareBoth("A", "a"); compareBoth("7", "3"); compareBoth("0000", "0"); diff --git a/Userland/Libraries/LibUnicode/CMakeLists.txt b/Userland/Libraries/LibUnicode/CMakeLists.txt index cc99684d018..b0ae1bc1b63 100644 --- a/Userland/Libraries/LibUnicode/CMakeLists.txt +++ b/Userland/Libraries/LibUnicode/CMakeLists.txt @@ -2,6 +2,7 @@ include(${SerenityOS_SOURCE_DIR}/Meta/CMake/unicode_data.cmake) set(SOURCES CharacterTypes.cpp + Collator.cpp CurrencyCode.cpp DateTimeFormat.cpp DisplayNames.cpp diff --git a/Userland/Libraries/LibUnicode/Collator.cpp b/Userland/Libraries/LibUnicode/Collator.cpp new file mode 100644 index 00000000000..617eabc386d --- /dev/null +++ b/Userland/Libraries/LibUnicode/Collator.cpp @@ -0,0 +1,196 @@ +/* + * Copyright (c) 2024, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include + +#include + +namespace Unicode { + +Usage usage_from_string(StringView usage) +{ + if (usage == "sort"sv) + return Usage::Sort; + if (usage == "search"sv) + return Usage::Search; + VERIFY_NOT_REACHED(); +} + +StringView usage_to_string(Usage usage) +{ + switch (usage) { + case Usage::Sort: + return "sort"sv; + case Usage::Search: + return "search"sv; + } + VERIFY_NOT_REACHED(); +} + +static NonnullOwnPtr apply_usage_to_locale(icu::Locale const& locale, Usage usage, StringView collation) +{ + auto result = adopt_own(*locale.clone()); + UErrorCode status = U_ZERO_ERROR; + + switch (usage) { + case Usage::Sort: + result->setUnicodeKeywordValue("co", icu_string_piece(collation), status); + break; + case Usage::Search: + result->setUnicodeKeywordValue("co", "search", status); + break; + } + + VERIFY(icu_success(status)); + return result; +} + +Sensitivity sensitivity_from_string(StringView sensitivity) +{ + if (sensitivity == "base"sv) + return Sensitivity::Base; + if (sensitivity == "accent"sv) + return Sensitivity::Accent; + if (sensitivity == "case"sv) + return Sensitivity::Case; + if (sensitivity == "variant"sv) + return Sensitivity::Variant; + VERIFY_NOT_REACHED(); +} + +StringView sensitivity_to_string(Sensitivity sensitivity) +{ + switch (sensitivity) { + case Sensitivity::Base: + return "base"sv; + case Sensitivity::Accent: + return "accent"sv; + case Sensitivity::Case: + return "case"sv; + case Sensitivity::Variant: + return "variant"sv; + } + VERIFY_NOT_REACHED(); +} + +static constexpr UColAttributeValue icu_sensitivity(Sensitivity sensitivity) +{ + switch (sensitivity) { + case Sensitivity::Base: + return UCOL_PRIMARY; + case Sensitivity::Accent: + return UCOL_SECONDARY; + case Sensitivity::Case: + return UCOL_PRIMARY; + case Sensitivity::Variant: + return UCOL_TERTIARY; + } + VERIFY_NOT_REACHED(); +} + +CaseFirst case_first_from_string(StringView case_first) +{ + if (case_first == "upper"sv) + return CaseFirst::Upper; + if (case_first == "lower"sv) + return CaseFirst::Lower; + if (case_first == "false"sv) + return CaseFirst::False; + VERIFY_NOT_REACHED(); +} + +StringView case_first_to_string(CaseFirst case_first) +{ + switch (case_first) { + case CaseFirst::Upper: + return "upper"sv; + case CaseFirst::Lower: + return "lower"sv; + case CaseFirst::False: + return "false"sv; + } + VERIFY_NOT_REACHED(); +} + +static constexpr UColAttributeValue icu_case_first(CaseFirst case_first) +{ + switch (case_first) { + case CaseFirst::Upper: + return UCOL_UPPER_FIRST; + case CaseFirst::Lower: + return UCOL_LOWER_FIRST; + case CaseFirst::False: + return UCOL_OFF; + } + VERIFY_NOT_REACHED(); +} + +class CollatorImpl : public Collator { +public: + explicit CollatorImpl(NonnullOwnPtr collator) + : m_collator(move(collator)) + { + } + + virtual Collator::Order compare(StringView lhs, StringView rhs) const override + { + UErrorCode status = U_ZERO_ERROR; + + auto result = m_collator->compareUTF8(icu_string_piece(lhs), icu_string_piece(rhs), status); + VERIFY(icu_success(status)); + + switch (result) { + case UCOL_LESS: + return Order::Before; + case UCOL_EQUAL: + return Order::Equal; + case UCOL_GREATER: + return Order::After; + } + + VERIFY_NOT_REACHED(); + } + +private: + NonnullOwnPtr m_collator; +}; + +NonnullOwnPtr Collator::create( + StringView locale, + Usage usage, + StringView collation, + Sensitivity sensitivity, + CaseFirst case_first, + bool numeric, + bool ignore_punctuation) +{ + UErrorCode status = U_ZERO_ERROR; + + auto locale_data = LocaleData::for_locale(locale); + VERIFY(locale_data.has_value()); + + auto locale_with_usage = apply_usage_to_locale(locale_data->locale(), usage, collation); + + auto collator = adopt_own(*icu::Collator::createInstance(*locale_with_usage, status)); + VERIFY(icu_success(status)); + + auto set_attribute = [&](UColAttribute attribute, UColAttributeValue value) { + collator->setAttribute(attribute, value, status); + VERIFY(icu_success(status)); + }; + + set_attribute(UCOL_STRENGTH, icu_sensitivity(sensitivity)); + set_attribute(UCOL_CASE_LEVEL, sensitivity == Sensitivity::Case ? UCOL_ON : UCOL_OFF); + set_attribute(UCOL_CASE_FIRST, icu_case_first(case_first)); + set_attribute(UCOL_NUMERIC_COLLATION, numeric ? UCOL_ON : UCOL_OFF); + set_attribute(UCOL_ALTERNATE_HANDLING, ignore_punctuation ? UCOL_SHIFTED : UCOL_NON_IGNORABLE); + set_attribute(UCOL_NORMALIZATION_MODE, UCOL_ON); + + return adopt_own(*new CollatorImpl(move(collator))); +} + +} diff --git a/Userland/Libraries/LibUnicode/Collator.h b/Userland/Libraries/LibUnicode/Collator.h new file mode 100644 index 00000000000..1949139eef5 --- /dev/null +++ b/Userland/Libraries/LibUnicode/Collator.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2024, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include + +namespace Unicode { + +enum class Usage { + Sort, + Search, +}; +Usage usage_from_string(StringView); +StringView usage_to_string(Usage); + +enum class Sensitivity { + Base, + Accent, + Case, + Variant, +}; +Sensitivity sensitivity_from_string(StringView); +StringView sensitivity_to_string(Sensitivity); + +enum class CaseFirst { + Upper, + Lower, + False, +}; +CaseFirst case_first_from_string(StringView); +StringView case_first_to_string(CaseFirst); + +class Collator { +public: + static NonnullOwnPtr create( + StringView locale, + Usage, + StringView collation, + Sensitivity, + CaseFirst, + bool numeric, + bool ignore_punctuation); + + virtual ~Collator() = default; + + enum class Order { + Before, + Equal, + After, + }; + virtual Order compare(StringView, StringView) const = 0; + +protected: + Collator() = default; +}; + +}