LibJS+LibUnicode: Fully implement Intl.Collator with ICU

We were never able to implement anything other than a basic, locale-
unaware collator with the JSON export of the CLDR as it did not have
collation data. We can now use ICU to implement collation.
This commit is contained in:
Timothy Flynn 2024-08-14 14:46:19 -04:00 committed by Andreas Kling
commit eb7e3583c9
Notes: github-actions[bot] 2024-08-15 11:45:44 +00:00
11 changed files with 384 additions and 142 deletions

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2022-2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -16,83 +16,6 @@ Collator::Collator(Object& prototype)
{
}
// Stores [[Usage]] from its string form ("sort" or "search").
// Any other string reaches VERIFY_NOT_REACHED().
void Collator::set_usage(StringView type)
{
    if (type == "sort"sv) {
        m_usage = Usage::Sort;
        return;
    }

    if (type == "search"sv) {
        m_usage = Usage::Search;
        return;
    }

    VERIFY_NOT_REACHED();
}
// Returns the string form of [[Usage]] ("sort" or "search").
StringView Collator::usage_string() const
{
    switch (m_usage) {
    case Usage::Search:
        return "search"sv;
    case Usage::Sort:
        return "sort"sv;
    default:
        break;
    }

    // Only reachable if m_usage holds a value outside the enumerators above.
    VERIFY_NOT_REACHED();
}
// Stores [[Sensitivity]] from its string form ("base", "accent", "case" or "variant").
// Any other string reaches VERIFY_NOT_REACHED().
void Collator::set_sensitivity(StringView type)
{
    if (type == "base"sv) {
        m_sensitivity = Sensitivity::Base;
        return;
    }

    if (type == "accent"sv) {
        m_sensitivity = Sensitivity::Accent;
        return;
    }

    if (type == "case"sv) {
        m_sensitivity = Sensitivity::Case;
        return;
    }

    if (type == "variant"sv) {
        m_sensitivity = Sensitivity::Variant;
        return;
    }

    VERIFY_NOT_REACHED();
}
// Returns the string form of [[Sensitivity]] ("base", "accent", "case" or "variant").
StringView Collator::sensitivity_string() const
{
    switch (m_sensitivity) {
    case Sensitivity::Variant:
        return "variant"sv;
    case Sensitivity::Case:
        return "case"sv;
    case Sensitivity::Accent:
        return "accent"sv;
    case Sensitivity::Base:
        return "base"sv;
    default:
        break;
    }

    // Only reachable if m_sensitivity holds a value outside the enumerators above.
    VERIFY_NOT_REACHED();
}
// Stores [[CaseFirst]] from its string form ("upper", "lower" or "false").
// Any other string reaches VERIFY_NOT_REACHED().
void Collator::set_case_first(StringView case_first)
{
    if (case_first == "upper"sv) {
        m_case_first = CaseFirst::Upper;
        return;
    }

    if (case_first == "lower"sv) {
        m_case_first = CaseFirst::Lower;
        return;
    }

    if (case_first == "false"sv) {
        m_case_first = CaseFirst::False;
        return;
    }

    VERIFY_NOT_REACHED();
}
// Returns the string form of [[CaseFirst]] ("upper", "lower" or "false").
StringView Collator::case_first_string() const
{
    switch (m_case_first) {
    case CaseFirst::False:
        return "false"sv;
    case CaseFirst::Lower:
        return "lower"sv;
    case CaseFirst::Upper:
        return "upper"sv;
    default:
        break;
    }

    // Only reachable if m_case_first holds a value outside the enumerators above.
    VERIFY_NOT_REACHED();
}
void Collator::visit_edges(Visitor& visitor)
{
Base::visit_edges(visitor);

View file

@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, Tim Flynn <trflynn89@serenityos.org>
* Copyright (c) 2022-2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@ -11,6 +11,7 @@
#include <AK/StringView.h>
#include <LibJS/Runtime/Intl/CollatorCompareFunction.h>
#include <LibJS/Runtime/Object.h>
#include <LibUnicode/Collator.h>
namespace JS::Intl {
@ -19,24 +20,6 @@ class Collator final : public Object {
JS_DECLARE_ALLOCATOR(Collator);
public:
enum class Usage {
Sort,
Search,
};
enum class Sensitivity {
Base,
Accent,
Case,
Variant,
};
enum class CaseFirst {
Upper,
Lower,
False,
};
static constexpr auto relevant_extension_keys()
{
// 10.2.3 Internal slots, https://tc39.es/ecma402/#sec-intl-collator-internal-slots
@ -49,17 +32,17 @@ public:
String const& locale() const { return m_locale; }
void set_locale(String locale) { m_locale = move(locale); }
Usage usage() const { return m_usage; }
void set_usage(StringView usage);
StringView usage_string() const;
Unicode::Usage usage() const { return m_usage; }
void set_usage(StringView usage) { m_usage = Unicode::usage_from_string(usage); }
StringView usage_string() const { return Unicode::usage_to_string(m_usage); }
Sensitivity sensitivity() const { return m_sensitivity; }
void set_sensitivity(StringView sensitivity);
StringView sensitivity_string() const;
Unicode::Sensitivity sensitivity() const { return m_sensitivity; }
void set_sensitivity(StringView sensitivity) { m_sensitivity = Unicode::sensitivity_from_string(sensitivity); }
StringView sensitivity_string() const { return Unicode::sensitivity_to_string(m_sensitivity); }
CaseFirst case_first() const { return m_case_first; }
void set_case_first(StringView case_first);
StringView case_first_string() const;
Unicode::CaseFirst case_first() const { return m_case_first; }
void set_case_first(StringView case_first) { m_case_first = Unicode::case_first_from_string(case_first); }
StringView case_first_string() const { return Unicode::case_first_to_string(m_case_first); }
String const& collation() const { return m_collation; }
void set_collation(String collation) { m_collation = move(collation); }
@ -73,19 +56,25 @@ public:
CollatorCompareFunction* bound_compare() const { return m_bound_compare; }
void set_bound_compare(CollatorCompareFunction* bound_compare) { m_bound_compare = bound_compare; }
Unicode::Collator const& collator() const { return *m_collator; }
void set_collator(NonnullOwnPtr<Unicode::Collator> collator) { m_collator = move(collator); }
private:
explicit Collator(Object& prototype);
virtual void visit_edges(Visitor&) override;
String m_locale; // [[Locale]]
Usage m_usage { Usage::Sort }; // [[Usage]]
Sensitivity m_sensitivity { Sensitivity::Variant }; // [[Sensitivity]]
CaseFirst m_case_first { CaseFirst::False }; // [[CaseFirst]]
Unicode::Usage m_usage { Unicode::Usage::Sort }; // [[Usage]]
Unicode::Sensitivity m_sensitivity { Unicode::Sensitivity::Variant }; // [[Sensitivity]]
Unicode::CaseFirst m_case_first { Unicode::CaseFirst::False }; // [[CaseFirst]]
String m_collation; // [[Collation]]
bool m_ignore_punctuation { false }; // [[IgnorePunctuation]]
bool m_numeric { false }; // [[Numeric]]
GCPtr<CollatorCompareFunction> m_bound_compare; // [[BoundCompare]]
// Non-standard. Stores the ICU collator for the Intl object's collation options.
OwnPtr<Unicode::Collator> m_collator;
};
}

View file

@ -31,22 +31,10 @@ void CollatorCompareFunction::initialize(Realm&)
define_direct_property(vm.names.name, PrimitiveString::create(vm, String {}), Attribute::Configurable);
}
// 10.3.3.2 CompareStrings ( collator, x, y ), https://tc39.es/ecma402/#sec-collator-comparestrings
double compare_strings(Collator& collator, Utf8View const& x, Utf8View const& y)
void CollatorCompareFunction::visit_edges(Visitor& visitor)
{
// FIXME: Implement https://unicode.org/reports/tr10
(void)collator;
auto x_iterator = x.begin();
auto y_iterator = y.begin();
for (; x_iterator != x.end() && y_iterator != y.end(); ++x_iterator, ++y_iterator) {
if (*x_iterator != *y_iterator)
return static_cast<double>(*x_iterator) - static_cast<double>(*y_iterator);
}
if (x_iterator != x.end())
return 1.0;
if (y_iterator != y.end())
return -1.0;
return 0.0;
Base::visit_edges(visitor);
visitor.visit(m_collator);
}
// 10.3.3.1 Collator Compare Functions, https://tc39.es/ecma402/#sec-collator-compare-functions
@ -61,17 +49,32 @@ ThrowCompletionOr<Value> CollatorCompareFunction::call()
// 5. Let X be ? ToString(x).
auto x = TRY(vm.argument(0).to_string(vm));
// 6. Let Y be ? ToString(y).
auto y = TRY(vm.argument(1).to_string(vm));
// 7. Return CompareStrings(collator, X, Y).
return compare_strings(m_collator, x.code_points(), y.code_points());
return compare_strings(m_collator, x, y);
}
void CollatorCompareFunction::visit_edges(Visitor& visitor)
// 10.3.3.2 CompareStrings ( collator, x, y ), https://tc39.es/ecma402/#sec-collator-comparestrings
int compare_strings(Collator const& collator, StringView x, StringView y)
{
Base::visit_edges(visitor);
visitor.visit(m_collator);
auto result = collator.collator().compare(x, y);
// The result is intended to correspond with a sort order of String values according to the effective locale and
// collation options of collator, and will be negative when x is ordered before y, positive when x is ordered after
// y, and zero in all other cases (representing no relative ordering between x and y).
switch (result) {
case Unicode::Collator::Order::Before:
return -1;
case Unicode::Collator::Order::Equal:
return 0;
case Unicode::Collator::Order::After:
return 1;
}
VERIFY_NOT_REACHED();
}
}

View file

@ -30,6 +30,6 @@ private:
NonnullGCPtr<Collator> m_collator; // [[Collator]]
};
double compare_strings(Collator&, Utf8View const& x, Utf8View const& y);
int compare_strings(Collator const&, StringView x, StringView y);
}

View file

@ -114,7 +114,7 @@ static ThrowCompletionOr<NonnullGCPtr<Collator>> initialize_collator(VM& vm, Col
// 27. If sensitivity is undefined, then
if (sensitivity.is_undefined()) {
// a. If usage is "sort", then
if (collator.usage() == Collator::Usage::Sort) {
if (collator.usage() == Unicode::Usage::Sort) {
// i. Let sensitivity be "variant".
sensitivity = PrimitiveString::create(vm, "variant"_string);
}
@ -136,6 +136,17 @@ static ThrowCompletionOr<NonnullGCPtr<Collator>> initialize_collator(VM& vm, Col
// 30. Set collator.[[IgnorePunctuation]] to ignorePunctuation.
collator.set_ignore_punctuation(ignore_punctuation.as_bool());
// Non-standard, create an ICU collator for this Intl object.
auto icu_collator = Unicode::Collator::create(
collator.locale(),
collator.usage(),
collator.collation(),
collator.sensitivity(),
collator.case_first(),
collator.numeric(),
collator.ignore_punctuation());
collator.set_collator(move(icu_collator));
// 31. Return collator.
return collator;
}

View file

@ -566,7 +566,7 @@ JS_DEFINE_NATIVE_FUNCTION(StringPrototype::locale_compare)
auto collator = TRY(construct(vm, realm.intrinsics().intl_collator_constructor(), vm.argument(1), vm.argument(2)));
// 5. Return CompareStrings(collator, S, thatValue).
return Intl::compare_strings(static_cast<Intl::Collator&>(*collator), string.code_points(), that_value.code_points());
return Intl::compare_strings(static_cast<Intl::Collator const&>(*collator), string, that_value);
}
// 22.1.3.13 String.prototype.match ( regexp ), https://tc39.es/ecma262/#sec-string.prototype.match

View file

@ -17,13 +17,13 @@ describe("correct behavior", () => {
const aTob = collator.compare(a, b);
const bToa = collator.compare(b, a);
expect(aTob > 0).toBeTrue();
expect(aTob).toBe(-bToa);
expect(aTob).toBe(1);
expect(bToa).toBe(-1);
}
compareBoth("a", "");
compareBoth("1", "");
compareBoth("a", "A");
compareBoth("A", "a");
compareBoth("7", "3");
compareBoth("0000", "0");
@ -31,8 +31,65 @@ describe("correct behavior", () => {
expect(collator.compare("undefined", undefined)).toBe(0);
expect(collator.compare("null", null)).toBe(0);
expect(collator.compare("null", undefined)).not.toBe(0);
expect(collator.compare("null") < 0).toBeTrue();
expect(collator.compare("null", undefined)).toBe(-1);
expect(collator.compare("null")).toBe(-1);
});
test("canonically equivalent strings", () => {
var tests = [
["ä\u0306", "a\u0308\u0306"],
["ă\u0308", "a\u0306\u0308"],
["ạ\u0308", "a\u0323\u0308"],
["a\u0308\u0323", "a\u0323\u0308"],
["ä\u0323", "a\u0323\u0308"],
["Å", "Å"],
["Å", "A\u030A"],
["Ç", "C\u0327"],
["ḋ\u0323", "ḍ\u0307"],
["ḋ\u0323", "d\u0323\u0307"],
["ô", "o\u0302"],
["ö", "o\u0308"],
["q\u0307\u0323", "q\u0323\u0307"],
["ṩ", "s\u0323\u0307"],
["ự", "ụ\u031B"],
["ự", "u\u031B\u0323"],
["ự", "ư\u0323"],
["ự", "u\u0323\u031B"],
["Ω", "Ω"],
["x\u031B\u0323", "x\u0323\u031B"],
["퓛", "\u1111\u1171\u11B6"],
["北", "\uD87E\uDC2B"],
["가", "\u1100\u1161"],
["\uD834\uDD5E", "\uD834\uDD57\uD834\uDD65"],
];
const en = new Intl.Collator("en");
const ja = new Intl.Collator("ja");
const th = new Intl.Collator("th");
tests.forEach(test => {
expect(en.compare(test[0], test[1])).toBe(0);
expect(ja.compare(test[0], test[1])).toBe(0);
expect(th.compare(test[0], test[1])).toBe(0);
});
});
// For every ignorePunctuation option value and every tested locale, the empty string must compare
// equal to a lone space / comma when the collator resolved ignorePunctuation to true, and must
// sort before them otherwise. The expectation is read back from resolvedOptions() rather than
// taken from the option that was passed in, because `undefined` leaves the choice to the collator.
test("ignorePunctuation", () => {
    [undefined, true, false].forEach(ignorePunctuation => {
        ["en", "ja", "th"].forEach(locale => {
            const collator = new Intl.Collator(locale, { ignorePunctuation });
            const expected = collator.resolvedOptions().ignorePunctuation ? 0 : -1;

            expect(collator.compare("", " ")).toBe(expected);
            expect(collator.compare("", ",")).toBe(expected);
        });
    });
});
test("UTF-16", () => {

View file

@ -15,7 +15,7 @@ test("basic functionality", () => {
compareBoth("a", "");
compareBoth("1", "");
compareBoth("a", "A");
compareBoth("A", "a");
compareBoth("7", "3");
compareBoth("0000", "0");

View file

@ -2,6 +2,7 @@ include(${SerenityOS_SOURCE_DIR}/Meta/CMake/unicode_data.cmake)
set(SOURCES
CharacterTypes.cpp
Collator.cpp
CurrencyCode.cpp
DateTimeFormat.cpp
DisplayNames.cpp

View file

@ -0,0 +1,196 @@
/*
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibUnicode/Collator.h>
#include <LibUnicode/ICU.h>
#include <unicode/coll.h>
namespace Unicode {
// Parses the string form of a collator usage ("sort" or "search").
// Any other input reaches VERIFY_NOT_REACHED().
Usage usage_from_string(StringView usage)
{
    auto const is_sort = usage == "sort"sv;

    if (!is_sort && usage != "search"sv)
        VERIFY_NOT_REACHED();

    return is_sort ? Usage::Sort : Usage::Search;
}
// Returns the string form of a collator usage; the inverse of usage_from_string().
StringView usage_to_string(Usage usage)
{
    // No default label, so the compiler can flag enumerators that are not handled here.
    switch (usage) {
    case Usage::Search:
        return "search"sv;
    case Usage::Sort:
        return "sort"sv;
    }

    VERIFY_NOT_REACHED();
}
// Returns a copy of `locale` whose Unicode "co" keyword reflects the requested usage:
// - Usage::Sort uses the caller-provided `collation` value.
// - Usage::Search always uses "search", ignoring `collation`.
// Verifies that ICU reported no error while setting the keyword.
static NonnullOwnPtr<icu::Locale> apply_usage_to_locale(icu::Locale const& locale, Usage usage, StringView collation)
{
    UErrorCode status = U_ZERO_ERROR;
    auto locale_with_collation = adopt_own(*locale.clone());

    if (usage == Usage::Sort)
        locale_with_collation->setUnicodeKeywordValue("co", icu_string_piece(collation), status);
    else if (usage == Usage::Search)
        locale_with_collation->setUnicodeKeywordValue("co", "search", status);

    VERIFY(icu_success(status));
    return locale_with_collation;
}
// Parses the string form of a collator sensitivity ("base", "accent", "case" or "variant").
// Any other input reaches VERIFY_NOT_REACHED().
Sensitivity sensitivity_from_string(StringView sensitivity)
{
    if (sensitivity == "variant"sv)
        return Sensitivity::Variant;
    else if (sensitivity == "case"sv)
        return Sensitivity::Case;
    else if (sensitivity == "accent"sv)
        return Sensitivity::Accent;
    else if (sensitivity == "base"sv)
        return Sensitivity::Base;

    VERIFY_NOT_REACHED();
}
// Returns the string form of a collator sensitivity; the inverse of sensitivity_from_string().
StringView sensitivity_to_string(Sensitivity sensitivity)
{
    // No default label, so the compiler can flag enumerators that are not handled here.
    switch (sensitivity) {
    case Sensitivity::Variant:
        return "variant"sv;
    case Sensitivity::Case:
        return "case"sv;
    case Sensitivity::Accent:
        return "accent"sv;
    case Sensitivity::Base:
        return "base"sv;
    }

    VERIFY_NOT_REACHED();
}
// Maps a sensitivity onto the ICU strength value used for UCOL_STRENGTH.
// Base and Case both map to primary strength; Collator::create() additionally switches on
// UCOL_CASE_LEVEL for Case, which is what tells the two apart.
static constexpr UColAttributeValue icu_sensitivity(Sensitivity sensitivity)
{
    switch (sensitivity) {
    case Sensitivity::Base:
    case Sensitivity::Case:
        return UCOL_PRIMARY;
    case Sensitivity::Accent:
        return UCOL_SECONDARY;
    case Sensitivity::Variant:
        return UCOL_TERTIARY;
    }

    VERIFY_NOT_REACHED();
}
// Parses the string form of a case-first option ("upper", "lower" or "false").
// Any other input reaches VERIFY_NOT_REACHED().
CaseFirst case_first_from_string(StringView case_first)
{
    if (case_first == "false"sv)
        return CaseFirst::False;
    else if (case_first == "lower"sv)
        return CaseFirst::Lower;
    else if (case_first == "upper"sv)
        return CaseFirst::Upper;

    VERIFY_NOT_REACHED();
}
// Returns the string form of a case-first option; the inverse of case_first_from_string().
StringView case_first_to_string(CaseFirst case_first)
{
    // No default label, so the compiler can flag enumerators that are not handled here.
    switch (case_first) {
    case CaseFirst::False:
        return "false"sv;
    case CaseFirst::Lower:
        return "lower"sv;
    case CaseFirst::Upper:
        return "upper"sv;
    }

    VERIFY_NOT_REACHED();
}
// Maps a case-first option onto the ICU value used for UCOL_CASE_FIRST.
// CaseFirst::False maps to UCOL_OFF.
static constexpr UColAttributeValue icu_case_first(CaseFirst case_first)
{
    switch (case_first) {
    case CaseFirst::False:
        return UCOL_OFF;
    case CaseFirst::Lower:
        return UCOL_LOWER_FIRST;
    case CaseFirst::Upper:
        return UCOL_UPPER_FIRST;
    }

    VERIFY_NOT_REACHED();
}
// Concrete Collator that forwards comparisons to an owned icu::Collator.
class CollatorImpl : public Collator {
public:
    explicit CollatorImpl(NonnullOwnPtr<icu::Collator> collator)
        : m_collator(move(collator))
    {
    }

    // Compares two UTF-8 strings with the wrapped ICU collator and translates the ICU result
    // into Collator::Order. Verifies that ICU reported no error.
    virtual Collator::Order compare(StringView lhs, StringView rhs) const override
    {
        UErrorCode status = U_ZERO_ERROR;

        auto const lhs_piece = icu_string_piece(lhs);
        auto const rhs_piece = icu_string_piece(rhs);

        auto const comparison = m_collator->compareUTF8(lhs_piece, rhs_piece, status);
        VERIFY(icu_success(status));

        if (comparison == UCOL_LESS)
            return Order::Before;
        if (comparison == UCOL_EQUAL)
            return Order::Equal;
        if (comparison == UCOL_GREATER)
            return Order::After;

        VERIFY_NOT_REACHED();
    }

private:
    NonnullOwnPtr<icu::Collator> m_collator;
};
// Creates an ICU-backed collator configured from the given options.
//
// - locale:             locale to collate for; LocaleData::for_locale() must know it (verified).
// - usage / collation:  folded into the locale's Unicode "co" keyword by apply_usage_to_locale().
// - sensitivity:        sets UCOL_STRENGTH via icu_sensitivity(); Sensitivity::Case additionally
//                       enables UCOL_CASE_LEVEL.
// - case_first:         sets UCOL_CASE_FIRST via icu_case_first().
// - numeric:            toggles UCOL_NUMERIC_COLLATION.
// - ignore_punctuation: selects UCOL_SHIFTED (true) or UCOL_NON_IGNORABLE (false) for
//                       UCOL_ALTERNATE_HANDLING.
//
// Every ICU failure is treated as fatal (VERIFY).
NonnullOwnPtr<Collator> Collator::create(
    StringView locale,
    Usage usage,
    StringView collation,
    Sensitivity sensitivity,
    CaseFirst case_first,
    bool numeric,
    bool ignore_punctuation)
{
    UErrorCode status = U_ZERO_ERROR;

    auto locale_data = LocaleData::for_locale(locale);
    VERIFY(locale_data.has_value());

    auto locale_with_usage = apply_usage_to_locale(locale_data->locale(), usage, collation);

    // The status and the returned pointer must be validated *before* the pointer is dereferenced
    // for adopt_own(); otherwise a failed creation would dereference null before the VERIFY runs.
    auto* icu_collator = icu::Collator::createInstance(*locale_with_usage, status);
    VERIFY(icu_success(status));
    VERIFY(icu_collator);

    auto collator = adopt_own(*icu_collator);

    auto set_attribute = [&](UColAttribute attribute, UColAttributeValue value) {
        collator->setAttribute(attribute, value, status);
        VERIFY(icu_success(status));
    };

    set_attribute(UCOL_STRENGTH, icu_sensitivity(sensitivity));
    set_attribute(UCOL_CASE_LEVEL, sensitivity == Sensitivity::Case ? UCOL_ON : UCOL_OFF);
    set_attribute(UCOL_CASE_FIRST, icu_case_first(case_first));
    set_attribute(UCOL_NUMERIC_COLLATION, numeric ? UCOL_ON : UCOL_OFF);
    set_attribute(UCOL_ALTERNATE_HANDLING, ignore_punctuation ? UCOL_SHIFTED : UCOL_NON_IGNORABLE);
    // Normalization is always switched on, regardless of the options passed in.
    set_attribute(UCOL_NORMALIZATION_MODE, UCOL_ON);

    return adopt_own(*new CollatorImpl(move(collator)));
}
}

View file

@ -0,0 +1,62 @@
/*
* Copyright (c) 2024, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/NonnullOwnPtr.h>
#include <AK/StringView.h>
namespace Unicode {
// What a collator is used for. The string forms are "sort" and "search".
enum class Usage {
Sort,
Search,
};
// Convert between Usage and its string form. usage_from_string() requires one of the strings
// listed above; anything else is a programming error.
Usage usage_from_string(StringView);
StringView usage_to_string(Usage);
// Which differences between strings a collator treats as significant.
// The string forms are "base", "accent", "case" and "variant".
enum class Sensitivity {
Base,
Accent,
Case,
Variant,
};
// Convert between Sensitivity and its string form. sensitivity_from_string() requires one of the
// strings listed above; anything else is a programming error.
Sensitivity sensitivity_from_string(StringView);
StringView sensitivity_to_string(Sensitivity);
// Whether upper case or lower case sorts first. The string forms are "upper", "lower" and "false".
enum class CaseFirst {
Upper,
Lower,
False,
};
// Convert between CaseFirst and its string form. case_first_from_string() requires one of the
// strings listed above; anything else is a programming error.
CaseFirst case_first_from_string(StringView);
StringView case_first_to_string(CaseFirst);
// Abstract string collator. Instances are obtained through create(); the constructor is protected.
class Collator {
public:
// Creates a collator for the given locale, configured with the remaining options.
static NonnullOwnPtr<Collator> create(
StringView locale,
Usage,
StringView collation,
Sensitivity,
CaseFirst,
bool numeric,
bool ignore_punctuation);
virtual ~Collator() = default;
// Result of compare(): whether the first string is ordered before, equal to, or after the second.
enum class Order {
Before,
Equal,
After,
};
// Compares two strings according to this collator's configuration.
virtual Order compare(StringView, StringView) const = 0;
protected:
Collator() = default;
};
}