AK+LibUnicode: Implement Unicode-aware UTF-16 case transformations

This commit is contained in:
Timothy Flynn 2025-07-25 09:33:08 -04:00 committed by Jelle Raaijmakers
commit a740bfd8ff
Notes: github-actions[bot] 2025-07-25 16:18:07 +00:00
5 changed files with 524 additions and 0 deletions

View file

@ -138,6 +138,13 @@ public:
return from_string_builder_without_validation(builder);
}
// These methods require linking LibUnicode.
// Case conversions backed by ICU. `locale` is optional; when it is omitted (or no locale data is
// found for it), the ICU conversion is performed without an explicit locale argument.
Utf16String to_lowercase(Optional<StringView> const& locale = {}) const;
Utf16String to_uppercase(Optional<StringView> const& locale = {}) const;
// Titlecases the string. `trailing_code_point_transformation` defaults to Lowercase; passing
// PreserveExisting makes the implementation set ICU's U_TITLECASE_NO_LOWERCASE option.
Utf16String to_titlecase(Optional<StringView> const& locale = {}, TrailingCodePointTransformation trailing_code_point_transformation = TrailingCodePointTransformation::Lowercase) const;
// Returns the result of ICU's foldCase() applied to this string.
Utf16String to_casefold() const;
// Returns the result of running ICU's "Halfwidth-Fullwidth" transliterator over this string.
Utf16String to_fullwidth() const;
ALWAYS_INLINE Utf16String to_ascii_lowercase() const
{
auto view = utf16_view();

View file

@ -17,6 +17,7 @@ set(SOURCES
String.cpp
TimeZone.cpp
UnicodeKeywords.cpp
Utf16String.cpp
)
set(GENERATED_SOURCES ${CURRENT_LIB_GENERATED})

View file

@ -0,0 +1,87 @@
/*
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Utf16String.h>
#include <LibUnicode/ICU.h>
#include <unicode/stringoptions.h>
#include <unicode/translit.h>
#include <unicode/unistr.h>
// This file contains definitions of AK::Utf16String methods which require UCD data.
namespace AK {
// Returns a lowercased copy of this string.
//
// If `locale` is given and locale data is found for it, that ICU locale is handed to toLower();
// in every other case ICU's argument-less toLower() is used.
Utf16String Utf16String::to_lowercase(Optional<StringView> const& locale) const
{
    // With no locale requested, a string with ASCII storage takes the ASCII-only path and skips ICU.
    if (!locale.has_value() && has_ascii_storage())
        return to_ascii_lowercase();

    Optional<Unicode::LocaleData&> resolved_locale;
    if (locale.has_value())
        resolved_locale = Unicode::LocaleData::for_locale(*locale);

    auto lowercased = Unicode::icu_string(*this);

    // The ICU string is converted in place.
    if (resolved_locale.has_value())
        lowercased.toLower(resolved_locale->locale());
    else
        lowercased.toLower();

    return Unicode::icu_string_to_utf16_string(lowercased);
}
// Returns an uppercased copy of this string.
//
// If `locale` is given and locale data is found for it, that ICU locale is handed to toUpper();
// in every other case ICU's argument-less toUpper() is used.
Utf16String Utf16String::to_uppercase(Optional<StringView> const& locale) const
{
    // With no locale requested, a string with ASCII storage takes the ASCII-only path and skips ICU.
    if (!locale.has_value() && has_ascii_storage())
        return to_ascii_uppercase();

    Optional<Unicode::LocaleData&> resolved_locale;
    if (locale.has_value())
        resolved_locale = Unicode::LocaleData::for_locale(*locale);

    auto uppercased = Unicode::icu_string(*this);

    // The ICU string is converted in place.
    if (resolved_locale.has_value())
        uppercased.toUpper(resolved_locale->locale());
    else
        uppercased.toUpper();

    return Unicode::icu_string_to_utf16_string(uppercased);
}
// Returns a titlecased copy of this string.
//
// `locale` selects the ICU locale when locale data is found for it; otherwise ICU's default locale
// is used. `trailing_code_point_transformation` controls whether U_TITLECASE_NO_LOWERCASE is passed
// to ICU (it is set only for PreserveExisting).
Utf16String Utf16String::to_titlecase(Optional<StringView> const& locale, TrailingCodePointTransformation trailing_code_point_transformation) const
{
    Optional<Unicode::LocaleData&> resolved_locale;
    if (locale.has_value())
        resolved_locale = Unicode::LocaleData::for_locale(*locale);

    u32 const titlecase_options = trailing_code_point_transformation == TrailingCodePointTransformation::PreserveExisting
        ? U_TITLECASE_NO_LOWERCASE
        : 0;

    auto titlecased = Unicode::icu_string(*this);

    // A null break iterator is passed in both branches; only the locale argument differs.
    if (resolved_locale.has_value())
        titlecased.toTitle(nullptr, resolved_locale->locale(), titlecase_options);
    else
        titlecased.toTitle(nullptr, icu::Locale::getDefault(), titlecase_options);

    return Unicode::icu_string_to_utf16_string(titlecased);
}
// Returns a copy of this string after applying ICU's foldCase() with its default options.
Utf16String Utf16String::to_casefold() const
{
    auto folded = Unicode::icu_string(*this);
    folded.foldCase();

    return Unicode::icu_string_to_utf16_string(folded);
}
// Returns a copy of this string after running ICU's "Halfwidth-Fullwidth" transliterator over it.
// Failure to create the transliterator is treated as a fatal error (VERIFY).
Utf16String Utf16String::to_fullwidth() const
{
    UErrorCode error = U_ZERO_ERROR;

    // A transliterator instance is created on every call and owned for the duration of it.
    auto const halfwidth_to_fullwidth = adopt_own_if_nonnull(icu::Transliterator::createInstance("Halfwidth-Fullwidth", UTRANS_FORWARD, error));
    VERIFY(Unicode::icu_success(error));

    auto converted = Unicode::icu_string(*this);
    halfwidth_to_fullwidth->transliterate(converted);

    return Unicode::icu_string_to_utf16_string(converted);
}
}

View file

@ -113,6 +113,7 @@ if (CXX_COMPILER_SUPPORTS_OBJC_ARC)
endif()
target_link_libraries(TestString PRIVATE LibUnicode)
# TestUtf16String exercises Utf16String methods whose definitions live in LibUnicode.
target_link_libraries(TestUtf16String PRIVATE LibUnicode)
if (ENABLE_SWIFT)

View file

@ -411,6 +411,434 @@ TEST_CASE(repeated)
EXPECT_DEATH("Creating a string from an invalid code point", (void)Utf16String::repeated(0xffffffff, 1));
}
TEST_CASE(to_lowercase_unconditional_special_casing)
{
    // Lowercasing with no locale argument. Every input below is expected to come back unchanged,
    // except U+0130, which is expected to become "i" followed by COMBINING DOT ABOVE.

    // LATIN SMALL LETTER SHARP S
    EXPECT_EQ("\u00DF"_utf16.to_lowercase(), u"\u00DF"sv);

    // LATIN CAPITAL LETTER I WITH DOT ABOVE
    EXPECT_EQ("\u0130"_utf16.to_lowercase(), u"\u0069\u0307"sv);

    // LATIN SMALL LIGATURE FF
    EXPECT_EQ("\uFB00"_utf16.to_lowercase(), u"\uFB00"sv);

    // LATIN SMALL LIGATURE FI
    EXPECT_EQ("\uFB01"_utf16.to_lowercase(), u"\uFB01"sv);

    // LATIN SMALL LIGATURE FL
    EXPECT_EQ("\uFB02"_utf16.to_lowercase(), u"\uFB02"sv);

    // LATIN SMALL LIGATURE FFI
    EXPECT_EQ("\uFB03"_utf16.to_lowercase(), u"\uFB03"sv);

    // LATIN SMALL LIGATURE FFL
    EXPECT_EQ("\uFB04"_utf16.to_lowercase(), u"\uFB04"sv);

    // LATIN SMALL LIGATURE LONG S T
    EXPECT_EQ("\uFB05"_utf16.to_lowercase(), u"\uFB05"sv);

    // LATIN SMALL LIGATURE ST
    EXPECT_EQ("\uFB06"_utf16.to_lowercase(), u"\uFB06"sv);

    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ("\u1FB7"_utf16.to_lowercase(), u"\u1FB7"sv);

    // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ("\u1FC7"_utf16.to_lowercase(), u"\u1FC7"sv);

    // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ("\u1FF7"_utf16.to_lowercase(), u"\u1FF7"sv);
}
TEST_CASE(to_lowercase_special_casing_sigma)
{
    // Checks which lowercase form GREEK CAPITAL LETTER SIGMA (U+03A3) takes, U+03C3 or U+03C2,
    // depending on the characters surrounding it.

    EXPECT_EQ("ABCI"_utf16.to_lowercase(), u"abci"sv);

    // Sigma preceded by A
    EXPECT_EQ("A\u03A3"_utf16.to_lowercase(), u"a\u03C2"sv);

    // Sigma preceded by FEMININE ORDINAL INDICATOR
    EXPECT_EQ("\u00AA\u03A3"_utf16.to_lowercase(), u"\u00AA\u03C2"sv);

    // Sigma preceded by ROMAN NUMERAL ONE
    EXPECT_EQ("\u2160\u03A3"_utf16.to_lowercase(), u"\u2170\u03C2"sv);

    // Sigma preceded by COMBINING GREEK YPOGEGRAMMENI
    EXPECT_EQ("\u0345\u03A3"_utf16.to_lowercase(), u"\u0345\u03C3"sv);

    // Sigma preceded by A and FULL STOP
    EXPECT_EQ("A.\u03A3"_utf16.to_lowercase(), u"a.\u03C2"sv);

    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR
    EXPECT_EQ("A\u180E\u03A3"_utf16.to_lowercase(), u"a\u180E\u03C2"sv);

    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by B
    EXPECT_EQ("A\u180E\u03A3B"_utf16.to_lowercase(), u"a\u180E\u03C3b"sv);

    // Sigma followed by A
    EXPECT_EQ("\u03A3A"_utf16.to_lowercase(), u"\u03C3a"sv);

    // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR
    EXPECT_EQ("A\u03A3\u180E"_utf16.to_lowercase(), u"a\u03C2\u180E"sv);

    // Sigma preceded by A, followed by MONGOLIAN VOWEL SEPARATOR and B
    EXPECT_EQ("A\u03A3\u180EB"_utf16.to_lowercase(), u"a\u03C3\u180Eb"sv);

    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR
    EXPECT_EQ("A\u180E\u03A3\u180E"_utf16.to_lowercase(), u"a\u180E\u03C2\u180E"sv);

    // Sigma preceded by A and MONGOLIAN VOWEL SEPARATOR, followed by MONGOLIAN VOWEL SEPARATOR and B
    EXPECT_EQ("A\u180E\u03A3\u180EB"_utf16.to_lowercase(), u"a\u180E\u03C3\u180Eb"sv);
}
TEST_CASE(to_lowercase_special_casing_i)
{
    // Compares lowercasing of I-like sequences under "en" against the "az" and "tr" locales.

    // LATIN CAPITAL LETTER I
    EXPECT_EQ("I"_utf16.to_lowercase("en"sv), u"i"sv);
    EXPECT_EQ("I"_utf16.to_lowercase("az"sv), u"\u0131"sv);
    EXPECT_EQ("I"_utf16.to_lowercase("tr"sv), u"\u0131"sv);

    // LATIN CAPITAL LETTER I WITH DOT ABOVE
    EXPECT_EQ("\u0130"_utf16.to_lowercase("en"sv), u"\u0069\u0307"sv);
    EXPECT_EQ("\u0130"_utf16.to_lowercase("az"sv), u"i"sv);
    EXPECT_EQ("\u0130"_utf16.to_lowercase("tr"sv), u"i"sv);

    // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE
    EXPECT_EQ("I\u0307"_utf16.to_lowercase("en"sv), u"i\u0307"sv);
    EXPECT_EQ("I\u0307"_utf16.to_lowercase("az"sv), u"i"sv);
    EXPECT_EQ("I\u0307"_utf16.to_lowercase("tr"sv), u"i"sv);

    // LATIN CAPITAL LETTER I followed by combining class 0 and COMBINING DOT ABOVE
    EXPECT_EQ("IA\u0307"_utf16.to_lowercase("en"sv), u"ia\u0307"sv);
    EXPECT_EQ("IA\u0307"_utf16.to_lowercase("az"sv), u"\u0131a\u0307"sv);
    EXPECT_EQ("IA\u0307"_utf16.to_lowercase("tr"sv), u"\u0131a\u0307"sv);
}
TEST_CASE(to_lowercase_special_casing_more_above)
{
    // Compares lowercasing under "en" and "lt". For "lt", a COMBINING DOT ABOVE is expected to be
    // inserted when the letter is followed by COMBINING GRAVE ACCENT.

    // LATIN CAPITAL LETTER I
    EXPECT_EQ("I"_utf16.to_lowercase("en"sv), u"i"sv);
    EXPECT_EQ("I"_utf16.to_lowercase("lt"sv), u"i"sv);

    // LATIN CAPITAL LETTER J
    EXPECT_EQ("J"_utf16.to_lowercase("en"sv), u"j"sv);
    EXPECT_EQ("J"_utf16.to_lowercase("lt"sv), u"j"sv);

    // LATIN CAPITAL LETTER I WITH OGONEK
    EXPECT_EQ("\u012e"_utf16.to_lowercase("en"sv), u"\u012f"sv);
    EXPECT_EQ("\u012e"_utf16.to_lowercase("lt"sv), u"\u012f"sv);

    // LATIN CAPITAL LETTER I followed by COMBINING GRAVE ACCENT
    EXPECT_EQ("I\u0300"_utf16.to_lowercase("en"sv), u"i\u0300"sv);
    EXPECT_EQ("I\u0300"_utf16.to_lowercase("lt"sv), u"i\u0307\u0300"sv);

    // LATIN CAPITAL LETTER J followed by COMBINING GRAVE ACCENT
    EXPECT_EQ("J\u0300"_utf16.to_lowercase("en"sv), u"j\u0300"sv);
    EXPECT_EQ("J\u0300"_utf16.to_lowercase("lt"sv), u"j\u0307\u0300"sv);

    // LATIN CAPITAL LETTER I WITH OGONEK followed by COMBINING GRAVE ACCENT
    EXPECT_EQ("\u012e\u0300"_utf16.to_lowercase("en"sv), u"\u012f\u0300"sv);
    EXPECT_EQ("\u012e\u0300"_utf16.to_lowercase("lt"sv), u"\u012f\u0307\u0300"sv);
}
TEST_CASE(to_lowercase_special_casing_not_before_dot)
{
    // Lowercasing of "I" with and without a following COMBINING DOT ABOVE, under "en", "az" and "tr".

    // LATIN CAPITAL LETTER I
    EXPECT_EQ("I"_utf16.to_lowercase("en"sv), u"i"sv);
    EXPECT_EQ("I"_utf16.to_lowercase("az"sv), u"\u0131"sv);
    EXPECT_EQ("I"_utf16.to_lowercase("tr"sv), u"\u0131"sv);

    // LATIN CAPITAL LETTER I followed by COMBINING DOT ABOVE
    EXPECT_EQ("I\u0307"_utf16.to_lowercase("en"sv), u"i\u0307"sv);
    EXPECT_EQ("I\u0307"_utf16.to_lowercase("az"sv), u"i"sv);
    EXPECT_EQ("I\u0307"_utf16.to_lowercase("tr"sv), u"i"sv);
}
TEST_CASE(to_uppercase_unconditional_special_casing)
{
    // Uppercasing with no locale argument. Most inputs below are single code points that are
    // expected to expand into several code points.

    // LATIN SMALL LETTER SHARP S
    EXPECT_EQ("\u00DF"_utf16.to_uppercase(), u"\u0053\u0053"sv);

    // LATIN CAPITAL LETTER I WITH DOT ABOVE
    EXPECT_EQ("\u0130"_utf16.to_uppercase(), u"\u0130"sv);

    // LATIN SMALL LIGATURE FF
    EXPECT_EQ("\uFB00"_utf16.to_uppercase(), u"\u0046\u0046"sv);

    // LATIN SMALL LIGATURE FI
    EXPECT_EQ("\uFB01"_utf16.to_uppercase(), u"\u0046\u0049"sv);

    // LATIN SMALL LIGATURE FL
    EXPECT_EQ("\uFB02"_utf16.to_uppercase(), u"\u0046\u004C"sv);

    // LATIN SMALL LIGATURE FFI
    EXPECT_EQ("\uFB03"_utf16.to_uppercase(), u"\u0046\u0046\u0049"sv);

    // LATIN SMALL LIGATURE FFL
    EXPECT_EQ("\uFB04"_utf16.to_uppercase(), u"\u0046\u0046\u004C"sv);

    // LATIN SMALL LIGATURE LONG S T
    EXPECT_EQ("\uFB05"_utf16.to_uppercase(), u"\u0053\u0054"sv);

    // LATIN SMALL LIGATURE ST
    EXPECT_EQ("\uFB06"_utf16.to_uppercase(), u"\u0053\u0054"sv);

    // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
    EXPECT_EQ("\u0390"_utf16.to_uppercase(), u"\u0399\u0308\u0301"sv);

    // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
    EXPECT_EQ("\u03B0"_utf16.to_uppercase(), u"\u03A5\u0308\u0301"sv);

    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ("\u1FB7"_utf16.to_uppercase(), u"\u0391\u0342\u0399"sv);

    // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ("\u1FC7"_utf16.to_uppercase(), u"\u0397\u0342\u0399"sv);

    // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ("\u1FF7"_utf16.to_uppercase(), u"\u03A9\u0342\u0399"sv);
}
TEST_CASE(to_uppercase_special_casing_soft_dotted)
{
    // Compares uppercasing under "en" and "lt". For "lt", a COMBINING DOT ABOVE that follows
    // "i" or "j" is expected to be dropped.

    // LATIN SMALL LETTER I
    EXPECT_EQ("i"_utf16.to_uppercase("en"sv), u"I"sv);
    EXPECT_EQ("i"_utf16.to_uppercase("lt"sv), u"I"sv);

    // LATIN SMALL LETTER J
    EXPECT_EQ("j"_utf16.to_uppercase("en"sv), u"J"sv);
    EXPECT_EQ("j"_utf16.to_uppercase("lt"sv), u"J"sv);

    // LATIN SMALL LETTER I followed by COMBINING DOT ABOVE
    EXPECT_EQ("i\u0307"_utf16.to_uppercase("en"sv), u"I\u0307"sv);
    EXPECT_EQ("i\u0307"_utf16.to_uppercase("lt"sv), u"I"sv);

    // LATIN SMALL LETTER J followed by COMBINING DOT ABOVE
    EXPECT_EQ("j\u0307"_utf16.to_uppercase("en"sv), u"J\u0307"sv);
    EXPECT_EQ("j\u0307"_utf16.to_uppercase("lt"sv), u"J"sv);
}
TEST_CASE(to_titlecase)
{
    // Titlecasing with the default arguments (no locale, trailing code points lowercased).
    auto titlecase = [](auto const& input) {
        return input.to_titlecase();
    };

    // Empty and whitespace/punctuation-only inputs.
    EXPECT_EQ(titlecase(""_utf16), ""sv);
    EXPECT_EQ(titlecase(" "_utf16), " "sv);
    EXPECT_EQ(titlecase(" - "_utf16), " - "sv);

    // Single letters, with and without surrounding spaces.
    EXPECT_EQ(titlecase("a"_utf16), "A"sv);
    EXPECT_EQ(titlecase("A"_utf16), "A"sv);
    EXPECT_EQ(titlecase(" a"_utf16), " A"sv);
    EXPECT_EQ(titlecase("a "_utf16), "A "sv);

    // Two-letter words in every case combination.
    EXPECT_EQ(titlecase("ab"_utf16), "Ab"sv);
    EXPECT_EQ(titlecase("Ab"_utf16), "Ab"sv);
    EXPECT_EQ(titlecase("aB"_utf16), "Ab"sv);
    EXPECT_EQ(titlecase("AB"_utf16), "Ab"sv);
    EXPECT_EQ(titlecase(" ab"_utf16), " Ab"sv);
    EXPECT_EQ(titlecase("ab "_utf16), "Ab "sv);

    // Multiple words separated by whitespace or quotes.
    EXPECT_EQ(titlecase("foo bar baz"_utf16), "Foo Bar Baz"sv);
    EXPECT_EQ(titlecase("foo \n \r bar \t baz"_utf16), "Foo \n \r Bar \t Baz"sv);
    EXPECT_EQ(titlecase("f\"oo\" b'ar'"_utf16), "F\"Oo\" B'ar'"sv);
}
TEST_CASE(to_titlecase_unconditional_special_casing)
{
    // Titlecasing with no locale argument. Most inputs below are single code points that are
    // expected to expand into several code points.

    // LATIN SMALL LETTER SHARP S
    EXPECT_EQ("\u00DF"_utf16.to_titlecase(), u"\u0053\u0073"sv);

    // LATIN CAPITAL LETTER I WITH DOT ABOVE
    EXPECT_EQ("\u0130"_utf16.to_titlecase(), u"\u0130"sv);

    // LATIN SMALL LIGATURE FF
    EXPECT_EQ("\uFB00"_utf16.to_titlecase(), u"\u0046\u0066"sv);

    // LATIN SMALL LIGATURE FI
    EXPECT_EQ("\uFB01"_utf16.to_titlecase(), u"\u0046\u0069"sv);

    // LATIN SMALL LIGATURE FL
    EXPECT_EQ("\uFB02"_utf16.to_titlecase(), u"\u0046\u006C"sv);

    // LATIN SMALL LIGATURE FFI
    EXPECT_EQ("\uFB03"_utf16.to_titlecase(), u"\u0046\u0066\u0069"sv);

    // LATIN SMALL LIGATURE FFL
    EXPECT_EQ("\uFB04"_utf16.to_titlecase(), u"\u0046\u0066\u006C"sv);

    // LATIN SMALL LIGATURE LONG S T
    EXPECT_EQ("\uFB05"_utf16.to_titlecase(), u"\u0053\u0074"sv);

    // LATIN SMALL LIGATURE ST
    EXPECT_EQ("\uFB06"_utf16.to_titlecase(), u"\u0053\u0074"sv);

    // GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
    EXPECT_EQ("\u0390"_utf16.to_titlecase(), u"\u0399\u0308\u0301"sv);

    // GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
    EXPECT_EQ("\u03B0"_utf16.to_titlecase(), u"\u03A5\u0308\u0301"sv);

    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ("\u1FB7"_utf16.to_titlecase(), u"\u0391\u0342\u0345"sv);

    // GREEK SMALL LETTER ETA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ("\u1FC7"_utf16.to_titlecase(), u"\u0397\u0342\u0345"sv);

    // GREEK SMALL LETTER OMEGA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ("\u1FF7"_utf16.to_titlecase(), u"\u03A9\u0342\u0345"sv);
}
TEST_CASE(to_titlecase_special_casing_i)
{
    // Titlecasing "i" is expected to give "I" for "en" and U+0130 for "az" and "tr".

    // LATIN SMALL LETTER I
    EXPECT_EQ("i"_utf16.to_titlecase("en"sv), u"I"sv);
    EXPECT_EQ("i"_utf16.to_titlecase("az"sv), u"\u0130"sv);
    EXPECT_EQ("i"_utf16.to_titlecase("tr"sv), u"\u0130"sv);
}
TEST_CASE(to_casefold)
{
    // For every code point below 0x80, case folding must yield exactly one code unit, equal to the
    // ASCII-lowercased code point.
    for (u8 ascii_code_point = 0; ascii_code_point < 0x80; ++ascii_code_point) {
        auto folded = Utf16String::from_code_point(ascii_code_point).to_casefold();
        auto expected = to_ascii_lowercase(ascii_code_point);

        EXPECT_EQ(folded.length_in_code_units(), 1uz);
        EXPECT_EQ(folded.code_unit_at(0), expected);
    }

    // LATIN SMALL LETTER SHARP S
    EXPECT_EQ("\u00DF"_utf16.to_casefold(), u"\u0073\u0073"sv);

    // GREEK SMALL LETTER ALPHA WITH YPOGEGRAMMENI
    EXPECT_EQ("\u1FB3"_utf16.to_casefold(), u"\u03B1\u03B9"sv);

    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI
    EXPECT_EQ("\u1FB6"_utf16.to_casefold(), u"\u03B1\u0342"sv);

    // GREEK SMALL LETTER ALPHA WITH PERISPOMENI AND YPOGEGRAMMENI
    EXPECT_EQ("\u1FB7"_utf16.to_casefold(), u"\u03B1\u0342\u03B9"sv);
}
TEST_CASE(copy_operations)
{
auto test = [](Utf16String const& string1) {