AK: Support UTF-16 string formatting

The underlying storage used during string formatting is StringBuilder.
To support UTF-16 strings, this patch allows callers to specify a mode
during StringBuilder construction. The default mode is UTF-8, for which
StringBuilder remains unchanged.

In UTF-16 mode, we treat the StringBuilder's internal ByteBuffer as a
series of u16 code units. Appending a single character will append 2
bytes for that character (cast to a char16_t). Appending a StringView
will transcode the string to UTF-16.

Utf16String also gains the same memory optimization that we added for
String, where we hand off the underlying buffer to Utf16String to avoid
having to re-allocate.

In the future, we may want to further optimize for ASCII strings. For
example, we could defer committing to the u16-esque storage until we
see a non-ASCII code point.
This commit is contained in:
Timothy Flynn 2025-06-17 16:08:30 -04:00 committed by Tim Flynn
commit 2803d66d87
Notes: github-actions[bot] 2025-07-18 16:47:24 +00:00
11 changed files with 362 additions and 55 deletions

View file

@ -235,6 +235,80 @@ TEST_CASE(from_utf32)
}
}
// Exercises the Utf16String factories that build a string from other values:
// Utf16String::formatted, Utf16String::number and Utf16String::join. Each
// scenario checks the resulting contents, the code unit / code point counts,
// and which storage flavor (short ASCII, long ASCII, or neither) is reported.
TEST_CASE(formatted)
{
    // formatted() with a small integer: a two-character ASCII result that is
    // expected to report short ASCII storage.
    {
        auto formatted_integer = Utf16String::formatted("{}", 42);

        EXPECT(!formatted_integer.is_empty());
        EXPECT(formatted_integer.is_ascii());
        EXPECT(!formatted_integer.has_long_ascii_storage());
        EXPECT(formatted_integer.has_short_ascii_storage());
        EXPECT_EQ(formatted_integer.length_in_code_units(), 2uz);
        EXPECT_EQ(formatted_integer.length_in_code_points(), 2uz);
        EXPECT_EQ(formatted_integer, u"42"sv);
    }

    // number() with the same value must yield the same observable result.
    {
        auto numeric_string = Utf16String::number(42);

        EXPECT(!numeric_string.is_empty());
        EXPECT(numeric_string.is_ascii());
        EXPECT(!numeric_string.has_long_ascii_storage());
        EXPECT(numeric_string.has_short_ascii_storage());
        EXPECT_EQ(numeric_string.length_in_code_units(), 2uz);
        EXPECT_EQ(numeric_string.length_in_code_points(), 2uz);
        EXPECT_EQ(numeric_string, u"42"sv);
    }

    // formatted() mixing a UTF-8 view, a UTF-16 view and a floating point
    // argument. Each emoji occupies two UTF-16 code units but is a single code
    // point, hence 15 code units versus 13 code points.
    {
        auto mixed_string = Utf16String::formatted("whf {} {} {}!", "😀"sv, Utf16View { u"🍕"sv }, 3.14);

        EXPECT(!mixed_string.is_empty());
        EXPECT(!mixed_string.is_ascii());
        EXPECT(!mixed_string.has_long_ascii_storage());
        EXPECT(!mixed_string.has_short_ascii_storage());
        EXPECT_EQ(mixed_string.length_in_code_units(), 15uz);
        EXPECT_EQ(mixed_string.length_in_code_points(), 13uz);
        EXPECT_EQ(mixed_string, u"whf 😀 🍕 3.14!"sv);
    }

    // join() over ASCII-only pieces: 6 * 26 characters plus 5 two-character
    // separators is 166 code units, expected to report long ASCII storage.
    {
        Array ascii_pieces {
            u"abcdefghijklmnopqrstuvwxyz"sv,
            u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
            u"abcdefghijklmnopqrstuvwxyz"sv,
            u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
            u"abcdefghijklmnopqrstuvwxyz"sv,
            u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
        };

        auto joined_ascii = Utf16String::join(u"--"sv, ascii_pieces);

        EXPECT(!joined_ascii.is_empty());
        EXPECT(joined_ascii.is_ascii());
        EXPECT(joined_ascii.has_long_ascii_storage());
        EXPECT(!joined_ascii.has_short_ascii_storage());
        EXPECT_EQ(joined_ascii.length_in_code_units(), 166uz);
        EXPECT_EQ(joined_ascii.length_in_code_points(), 166uz);
        EXPECT_EQ(joined_ascii, u"abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ--abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ--abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv);
    }

    // join() where two pieces are non-ASCII. The first emoji is spelled as an
    // explicit surrogate pair (0xD83D 0xDE00, i.e. U+1F600) and the second as a
    // literal character. 6 * 26 + 2 * 2 + 7 * 2 = 174 code units; each emoji
    // counts once as a code point, giving 172.
    {
        Array mixed_pieces {
            u"abcdefghijklmnopqrstuvwxyz"sv,
            u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
            u"\xd83d\xde00"sv,
            u"abcdefghijklmnopqrstuvwxyz"sv,
            u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
            u"🍕"sv,
            u"abcdefghijklmnopqrstuvwxyz"sv,
            u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
        };

        auto joined_mixed = Utf16String::join(u"--"sv, mixed_pieces);

        EXPECT(!joined_mixed.is_empty());
        EXPECT(!joined_mixed.is_ascii());
        EXPECT(!joined_mixed.has_long_ascii_storage());
        EXPECT(!joined_mixed.has_short_ascii_storage());
        EXPECT_EQ(joined_mixed.length_in_code_units(), 174uz);
        EXPECT_EQ(joined_mixed.length_in_code_points(), 172uz);
        EXPECT_EQ(joined_mixed, u"abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ--😀--abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ--🍕--abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv);
    }
}
TEST_CASE(copy_operations)
{
auto test = [](Utf16String const& string1) {