AK: Support UTF-16 string formatting

The underlying storage used during string formatting is StringBuilder.
To support UTF-16 strings, this patch allows callers to specify a mode
during StringBuilder construction. The default mode is UTF-8, for which
StringBuilder remains unchanged.

In UTF-16 mode, we treat the StringBuilder's internal ByteBuffer as a
series of u16 code units. Appending a single character will append 2
bytes for that character (cast to a char16_t). Appending a StringView
will transcode the string to UTF-16.

Utf16String also gains the same memory optimization that we added for
String, where we hand off the underlying buffer to Utf16String to avoid
having to reallocate.

In the future, we may want to further optimize for ASCII strings. For
example, we could defer committing to the u16-esque storage until we
see a non-ASCII code point.
This commit is contained in:
Timothy Flynn 2025-06-17 16:08:30 -04:00 committed by Tim Flynn
parent fe676585f5
commit 2803d66d87
Notes: github-actions[bot] 2025-07-18 16:47:24 +00:00
11 changed files with 362 additions and 55 deletions

View file

@ -14,6 +14,8 @@
#include <AK/StringData.h>
#include <AK/StringView.h>
#include <AK/UnicodeUtils.h>
#include <AK/Utf16String.h>
#include <AK/Utf16StringData.h>
#include <AK/Utf16View.h>
#include <AK/Utf32View.h>
@ -21,45 +23,70 @@
namespace AK {
static constexpr auto STRING_BASE_PREFIX_SIZE = sizeof(Detail::StringData);
// Returns the number of bytes kept at the front of a builder's buffer for the given mode. The value is the size of
// (or the offset of the string storage within) the string data type that corresponds to that mode.
static constexpr size_t string_builder_prefix_size(StringBuilder::Mode mode)
{
    if (mode == StringBuilder::Mode::UTF8)
        return sizeof(Detail::StringData);

    if (mode == StringBuilder::Mode::UTF16)
        return Detail::Utf16StringData::offset_of_string_storage();

    // Every enumerator is handled above.
    VERIFY_NOT_REACHED();
}
// Builds the backing buffer for a StringBuilder.
//
// When the requested capacity exceeds the inline capacity, capacity for the prefix plus the requested amount is
// ensured up front. The buffer is then resized to exactly the prefix, so anything appended later lands after it.
// Allocation failures are propagated to the caller.
static ErrorOr<StringBuilder::Buffer> create_buffer(size_t capacity)
static ErrorOr<StringBuilder::Buffer> create_buffer(StringBuilder::Mode mode, size_t capacity)
{
StringBuilder::Buffer buffer;
auto prefix_size = string_builder_prefix_size(mode);
if (capacity > StringBuilder::inline_capacity)
TRY(buffer.try_ensure_capacity(STRING_BASE_PREFIX_SIZE + capacity));
TRY(buffer.try_ensure_capacity(prefix_size + capacity));
TRY(buffer.try_resize(STRING_BASE_PREFIX_SIZE));
TRY(buffer.try_resize(prefix_size));
return buffer;
}
// Fallible factory: creates a builder with the requested initial capacity in the default mode. A failure to set up
// the buffer is returned to the caller instead of being treated as fatal.
ErrorOr<StringBuilder> StringBuilder::create(size_t initial_capacity)
{
auto buffer = TRY(create_buffer(initial_capacity));
return StringBuilder { move(buffer) };
auto buffer = TRY(create_buffer(DEFAULT_MODE, initial_capacity));
return StringBuilder { move(buffer), DEFAULT_MODE };
}
// Default constructor: the builder starts out in the default mode with a buffer that holds only the prefix.
StringBuilder::StringBuilder()
{
static_assert(inline_capacity > STRING_BASE_PREFIX_SIZE);
m_buffer.resize(STRING_BASE_PREFIX_SIZE);
static constexpr auto prefix_size = string_builder_prefix_size(DEFAULT_MODE);
// The prefix must leave room for content within the inline capacity.
static_assert(inline_capacity > prefix_size);
m_buffer.resize(prefix_size);
}
// Creates a default-mode builder with the requested initial capacity. Failing to create the buffer is fatal (MUST).
StringBuilder::StringBuilder(size_t initial_capacity)
: m_buffer(MUST(create_buffer(initial_capacity)))
: m_buffer(MUST(create_buffer(DEFAULT_MODE, initial_capacity)))
{
}
StringBuilder::StringBuilder(Buffer buffer)
// Creates a builder in the given mode. The requested capacity equals the inline capacity, which create_buffer() does
// not treat as exceeding it, so no capacity is ensured up front.
StringBuilder::StringBuilder(Mode mode)
: m_buffer(MUST(create_buffer(mode, inline_capacity)))
, m_mode(mode)
{
}
// Creates a builder in the given mode with room for `initial_capacity_in_code_units` code units. The count is
// converted to bytes before the buffer is created: one byte per code unit in UTF-8 mode, two otherwise.
StringBuilder::StringBuilder(Mode mode, size_t initial_capacity_in_code_units)
: m_buffer(MUST(create_buffer(mode, initial_capacity_in_code_units * (mode == Mode::UTF8 ? 1 : 2))))
, m_mode(mode)
{
}
// Wraps an already prepared buffer. The buffer is taken as-is; nothing here checks that it was set up for `mode`.
StringBuilder::StringBuilder(Buffer buffer, Mode mode)
: m_buffer(move(buffer))
, m_mode(mode)
{
}
inline ErrorOr<void> StringBuilder::will_append(size_t size)
inline ErrorOr<void> StringBuilder::will_append(size_t size_in_bytes)
{
Checked<size_t> needed_capacity = m_buffer.size();
needed_capacity += size;
needed_capacity += size_in_bytes;
VERIFY(!needed_capacity.has_overflow());
// Prefer to completely use the existing capacity first
if (needed_capacity <= m_buffer.capacity())
@ -73,7 +100,7 @@ inline ErrorOr<void> StringBuilder::will_append(size_t size)
// Returns the size of the built content, i.e. the buffer size minus the mode's prefix.
// NOTE(review): the buffer is byte-based, so in UTF-16 mode this is a byte count rather than a code unit count
// (utf16_string_view() divides by two) - confirm that callers expect that.
size_t StringBuilder::length() const
{
return m_buffer.size() - STRING_BASE_PREFIX_SIZE;
return m_buffer.size() - string_builder_prefix_size(m_mode);
}
bool StringBuilder::is_empty() const
@ -83,6 +110,9 @@ bool StringBuilder::is_empty() const
// Removes up to `count` code units from the end of the built content.
//
// `count` is measured in code units: one byte each in UTF-8 mode, two bytes each in UTF-16 mode. The amount removed
// is clamped to the content stored after the mode's prefix, so the prefix bytes at the front of the buffer are never
// trimmed away (which would make length() underflow). Scaling to bytes happens after clamping, so it cannot overflow.
void StringBuilder::trim(size_t count)
{
    auto const bytes_per_code_unit = m_mode == Mode::UTF16 ? sizeof(char16_t) : sizeof(char);
    auto const content_size_in_bytes = m_buffer.size() - string_builder_prefix_size(m_mode);
    auto const trimmable_code_units = content_size_in_bytes / bytes_per_code_unit;

    auto const bytes_to_remove = min(count, trimmable_code_units) * bytes_per_code_unit;
    m_buffer.resize(m_buffer.size() - bytes_to_remove);
}
@ -91,21 +121,55 @@ ErrorOr<void> StringBuilder::try_append(StringView string)
{
    // Appending an empty view is a no-op in either mode.
    if (string.is_empty())
        return {};

    if (m_mode == Mode::UTF8) {
        // The bytes of the view are stored unchanged.
        TRY(will_append(string.length()));
        TRY(m_buffer.try_append(string.characters_without_null_termination(), string.length()));
    } else if (m_mode == Mode::UTF16) {
        // Reserve two bytes per input byte up front, then transcode one code point at a time.
        TRY(will_append(string.length() * 2));

        Utf8View code_points { string };
        for (auto code_point : code_points)
            TRY(try_append_code_point(code_point));
    }

    return {};
}
// Appends a single char. In UTF-8 mode it is stored as one byte; in UTF-16 mode it is forwarded to
// try_append_code_unit() and therefore stored as a two-byte code unit.
ErrorOr<void> StringBuilder::try_append(char ch)
{
    if (m_mode == Mode::UTF16) {
        TRY(try_append_code_unit(ch));
        return {};
    }

    if (m_mode == Mode::UTF8) {
        TRY(will_append(1));
        TRY(m_buffer.try_append(ch));
    }

    return {};
}
// Appends one 16-bit code unit. In UTF-16 mode its two bytes are written to the buffer directly; in UTF-8 mode the
// value is handed to try_append_code_point() to be encoded.
ErrorOr<void> StringBuilder::try_append_code_unit(char16_t ch)
{
    if (m_mode == Mode::UTF16) {
        TRY(will_append(2));
        TRY(m_buffer.try_append(&ch, sizeof(ch)));
        return {};
    }

    if (m_mode == Mode::UTF8)
        TRY(try_append_code_point(ch));

    return {};
}
ErrorOr<void> StringBuilder::try_append_repeated(char ch, size_t n)
{
TRY(will_append(n));
TRY(will_append(n * (m_mode == Mode::UTF8 ? 1 : 2)));
for (size_t i = 0; i < n; ++i)
TRY(try_append(ch));
return {};
@ -115,7 +179,7 @@ ErrorOr<void> StringBuilder::try_append_repeated(StringView string, size_t n)
{
if (string.is_empty())
return {};
TRY(will_append(string.length() * n));
// Reserve room for all `n` repetitions up front (two bytes per input byte in UTF-16 mode), not just for one copy.
TRY(will_append(string.length() * n * (m_mode == Mode::UTF8 ? 1 : 2)));
for (size_t i = 0; i < n; ++i)
TRY(try_append(string));
return {};
@ -141,6 +205,11 @@ void StringBuilder::append(char ch)
MUST(try_append(ch));
}
// Infallible wrapper around try_append_code_unit(): a failure to append is treated as fatal (MUST).
void StringBuilder::append_code_unit(char16_t ch)
{
MUST(try_append_code_unit(ch));
}
void StringBuilder::append_repeated(char ch, size_t n)
{
MUST(try_append_repeated(ch, n));
@ -158,6 +227,7 @@ ErrorOr<ByteBuffer> StringBuilder::to_byte_buffer() const
ByteString StringBuilder::to_byte_string() const
{
VERIFY(m_mode == Mode::UTF8);
if (is_empty())
return ByteString::empty();
return ByteString((char const*)data(), length());
@ -165,6 +235,7 @@ ByteString StringBuilder::to_byte_string() const
ErrorOr<String> StringBuilder::to_string()
{
VERIFY(m_mode == Mode::UTF8);
if (m_buffer.is_inline())
return String::from_utf8(string_view());
return String::from_string_builder({}, *this);
@ -172,6 +243,7 @@ ErrorOr<String> StringBuilder::to_string()
String StringBuilder::to_string_without_validation()
{
VERIFY(m_mode == Mode::UTF8);
if (m_buffer.is_inline())
return String::from_utf8_without_validation(string_view().bytes());
return String::from_string_builder_without_validation({}, *this);
@ -179,47 +251,108 @@ String StringBuilder::to_string_without_validation()
// Builds a FlyString from the current content without validating it. Only valid for UTF-8 mode builders (verified).
FlyString StringBuilder::to_fly_string_without_validation() const
{
VERIFY(m_mode == Mode::UTF8);
return FlyString::from_utf8_without_validation(string_view().bytes());
}
// Builds a FlyString from the current content via FlyString::from_utf8(), returning its error if it fails.
// Only valid for UTF-8 mode builders (verified).
ErrorOr<FlyString> StringBuilder::to_fly_string() const
{
VERIFY(m_mode == Mode::UTF8);
return FlyString::from_utf8(string_view());
}
// Produces a Utf16String from the built content. Only valid for UTF-16 mode builders (verified).
Utf16String StringBuilder::to_utf16_string()
{
    VERIFY(m_mode == Mode::UTF16);

    // With a non-inline buffer the builder itself is passed on so the string can be constructed from its buffer;
    // inline content is converted from a view of it instead.
    if (!m_buffer.is_inline())
        return Utf16String::from_string_builder({}, *this);

    return Utf16String::from_utf16(utf16_string_view());
}
// Same as to_utf16_string(), but uses the non-validating Utf16String factories. Only valid for UTF-16 mode builders
// (verified).
Utf16String StringBuilder::to_utf16_string_without_validation()
{
    VERIFY(m_mode == Mode::UTF16);

    // A non-inline buffer goes through the string-builder path; inline content is converted from a view of it.
    if (!m_buffer.is_inline())
        return Utf16String::from_string_builder_without_validation({}, *this);

    return Utf16String::from_utf16_without_validation(utf16_string_view());
}
// Returns a pointer to the first content byte, i.e. the start of the buffer advanced past the prefix.
u8* StringBuilder::data()
{
return m_buffer.data() + STRING_BASE_PREFIX_SIZE;
return m_buffer.data() + string_builder_prefix_size(m_mode);
}
// Const overload: returns a pointer to the first content byte, i.e. the start of the buffer advanced past the prefix.
u8 const* StringBuilder::data() const
{
return m_buffer.data() + STRING_BASE_PREFIX_SIZE;
return m_buffer.data() + string_builder_prefix_size(m_mode);
}
// Returns a view over the content bytes (everything after the prefix). Only valid for UTF-8 mode builders
// (verified); UTF-16 mode builders expose their content through utf16_string_view() instead.
StringView StringBuilder::string_view() const
{
return m_buffer.span().slice(STRING_BASE_PREFIX_SIZE);
VERIFY(m_mode == Mode::UTF8);
return m_buffer.span().slice(string_builder_prefix_size(m_mode));
}
// Returns a view over the content interpreted as 16-bit code units. Only valid for UTF-16 mode builders (verified).
Utf16View StringBuilder::utf16_string_view() const
{
    VERIFY(m_mode == Mode::UTF16);

    // Everything after the prefix is content; two bytes make up one code unit.
    auto content = m_buffer.span().slice(string_builder_prefix_size(m_mode));
    auto const* code_units = reinterpret_cast<char16_t const*>(content.data());
    auto code_unit_count = content.size() / 2;

    return { code_units, code_unit_count };
}
// Discards all content by resizing the buffer back to just the prefix. The mode is left unchanged.
void StringBuilder::clear()
{
m_buffer.resize(STRING_BASE_PREFIX_SIZE);
m_buffer.resize(string_builder_prefix_size(m_mode));
}
// Appends `code_point` encoded for the builder's mode.
//
// A value for which is_unicode() is false is replaced by UnicodeUtils::REPLACEMENT_CODE_POINT. In UTF-8 mode every
// encoded byte is appended through try_append(char); in UTF-16 mode every code unit is written to the buffer as two
// bytes. Errors from appending are propagated to the caller.
ErrorOr<void> StringBuilder::try_append_code_point(u32 code_point)
{
auto nwritten = TRY(AK::UnicodeUtils::try_code_point_to_utf8(code_point, [this](char c) { return try_append(c); }));
if (nwritten < 0) {
TRY(try_append(0xef));
TRY(try_append(0xbf));
TRY(try_append(0xbd));
if (!is_unicode(code_point)) {
TRY(try_append_code_point(UnicodeUtils::REPLACEMENT_CODE_POINT));
return {};
}
switch (m_mode) {
case Mode::UTF8:
TRY(AK::UnicodeUtils::try_code_point_to_utf8(code_point, [this](char c) { return try_append(c); }));
break;
case Mode::UTF16:
TRY(AK::UnicodeUtils::try_code_point_to_utf16(code_point, [this](char16_t c) { return m_buffer.try_append(&c, sizeof(c)); }));
break;
}
return {};
}
void StringBuilder::append_code_point(u32 code_point)
{
if (!is_unicode(code_point)) {
append_code_point(UnicodeUtils::REPLACEMENT_CODE_POINT);
return;
}
// UTF-16 mode: encode the code point as one code unit, or as a surrogate pair for supplementary-plane code points.
if (m_mode == Mode::UTF16) {
    (void)(will_append(2));

    // Only code points strictly below the first supplementary-plane code point fit into a single code unit. That
    // code point itself must take the surrogate pair path (the subtraction below maps it to offset 0); casting it
    // to a 16-bit code unit here would drop its high bits (assuming the constant is U+10000, as in standard UTF-16).
    if (code_point < UnicodeUtils::FIRST_SUPPLEMENTARY_PLANE_CODE_POINT) {
        auto code_unit = static_cast<char16_t>(code_point);
        m_buffer.append(&code_unit, sizeof(code_unit));
        return;
    }

    // A surrogate pair needs a second code unit.
    (void)(will_append(2));

    code_point -= UnicodeUtils::FIRST_SUPPLEMENTARY_PLANE_CODE_POINT;

    // High surrogate carries the bits above the low ten, low surrogate carries the low ten bits.
    auto code_unit = static_cast<u16>(UnicodeUtils::HIGH_SURROGATE_MIN | (code_point >> 10));
    m_buffer.append(&code_unit, sizeof(code_unit));

    code_unit = static_cast<u16>(UnicodeUtils::LOW_SURROGATE_MIN | (code_point & 0x3ff));
    m_buffer.append(&code_unit, sizeof(code_unit));

    return;
}
if (code_point <= 0x7f) {
m_buffer.append(static_cast<char>(code_point));
} else if (code_point <= 0x07ff) {
@ -231,17 +364,12 @@ void StringBuilder::append_code_point(u32 code_point)
m_buffer.append(static_cast<char>((((code_point >> 12) & 0x0f) | 0xe0)));
m_buffer.append(static_cast<char>((((code_point >> 6) & 0x3f) | 0x80)));
m_buffer.append(static_cast<char>((((code_point >> 0) & 0x3f) | 0x80)));
} else if (code_point <= 0x10ffff) {
} else {
(void)will_append(4);
m_buffer.append(static_cast<char>((((code_point >> 18) & 0x07) | 0xf0)));
m_buffer.append(static_cast<char>((((code_point >> 12) & 0x3f) | 0x80)));
m_buffer.append(static_cast<char>((((code_point >> 6) & 0x3f) | 0x80)));
m_buffer.append(static_cast<char>((((code_point >> 0) & 0x3f) | 0x80)));
} else {
(void)will_append(3);
m_buffer.append(0xef);
m_buffer.append(0xbf);
m_buffer.append(0xbd);
}
}
@ -252,6 +380,15 @@ ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
if (utf16_view.has_ascii_storage())
return try_append(utf16_view.bytes());
if (m_mode == Mode::UTF16) {
TRY(will_append(utf16_view.length_in_code_units() * 2));
for (size_t i = 0; i < utf16_view.length_in_code_units(); ++i)
TRY(try_append_code_unit(utf16_view.code_unit_at(i)));
return {};
}
auto remaining_view = utf16_view.utf16_span();
auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(remaining_view);
@ -356,7 +493,7 @@ ErrorOr<void> StringBuilder::try_append_escaped_for_json(StringView string)
return {};
}
auto StringBuilder::leak_buffer_for_string_construction(Badge<Detail::StringData>) -> Optional<Buffer::OutlineBuffer>
auto StringBuilder::leak_buffer_for_string_construction() -> Optional<Buffer::OutlineBuffer>
{
if (auto buffer = m_buffer.leak_outline_buffer({}); buffer.has_value()) {
clear();

View file

@ -16,6 +16,12 @@ namespace AK {
class StringBuilder {
public:
// Selects how the builder encodes the content it stores.
enum class Mode {
// Content is stored as UTF-8 bytes.
UTF8,
// Content is stored as 16-bit code units, two bytes each.
UTF16,
};
// Mode of builders that are created without an explicit Mode.
static constexpr auto DEFAULT_MODE = Mode::UTF8;
static constexpr size_t inline_capacity = 256;
using Buffer = Detail::ByteBuffer<inline_capacity>;
@ -24,36 +30,42 @@ public:
StringBuilder();
explicit StringBuilder(size_t initial_capacity);
explicit StringBuilder(Mode);
StringBuilder(Mode, size_t initial_capacity_in_code_units);
~StringBuilder() = default;
ErrorOr<void> try_append(StringView);
ErrorOr<void> try_append(Utf16View const&);
ErrorOr<void> try_append(Utf32View const&);
ErrorOr<void> try_append_code_point(u32);
ErrorOr<void> try_append(char);
ErrorOr<void> try_append_code_unit(char16_t);
ErrorOr<void> try_append_code_point(u32);
ErrorOr<void> try_append(char const*, size_t);
ErrorOr<void> try_append_repeated(char, size_t);
ErrorOr<void> try_append_repeated(StringView, size_t);
ErrorOr<void> try_append_escaped_for_json(StringView);
template<typename... Parameters>
ErrorOr<void> try_appendff(CheckedFormatString<Parameters...>&& fmtstr, Parameters const&... parameters)
{
VariadicFormatParams<AllowDebugOnlyFormatters::No, Parameters...> variadic_format_params { parameters... };
return vformat(*this, fmtstr.view(), variadic_format_params);
}
ErrorOr<void> try_append(char const*, size_t);
ErrorOr<void> try_append_repeated(char, size_t);
ErrorOr<void> try_append_repeated(StringView, size_t);
ErrorOr<void> try_append_escaped_for_json(StringView);
void append(StringView);
void append(Utf16View const&);
void append(Utf32View const&);
void append(char);
void append_code_unit(char16_t);
void append_code_point(u32);
void append(char const*, size_t);
void appendvf(char const*, va_list);
void append_repeated(char, size_t);
void append_repeated(StringView, size_t);
void append_as_lowercase(char);
void append_escaped_for_json(StringView);
void append_as_lowercase(char);
template<typename... Parameters>
void appendff(CheckedFormatString<Parameters...>&& fmtstr, Parameters const&... parameters)
@ -70,9 +82,13 @@ public:
[[nodiscard]] FlyString to_fly_string_without_validation() const;
ErrorOr<FlyString> to_fly_string() const;
Utf16String to_utf16_string();
Utf16String to_utf16_string_without_validation();
[[nodiscard]] ErrorOr<ByteBuffer> to_byte_buffer() const;
[[nodiscard]] StringView string_view() const;
[[nodiscard]] Utf16View utf16_string_view() const;
void clear();
[[nodiscard]] size_t length() const;
@ -98,16 +114,20 @@ public:
return {};
}
Optional<Buffer::OutlineBuffer> leak_buffer_for_string_construction(Badge<Detail::StringData>);
Optional<Buffer::OutlineBuffer> leak_buffer_for_string_construction(Badge<Detail::StringData>) { return leak_buffer_for_string_construction(); }
Optional<Buffer::OutlineBuffer> leak_buffer_for_string_construction(Badge<Detail::Utf16StringData>) { return leak_buffer_for_string_construction(); }
private:
explicit StringBuilder(Buffer);
StringBuilder(Buffer, Mode);
Optional<Buffer::OutlineBuffer> leak_buffer_for_string_construction();
ErrorOr<void> will_append(size_t);
u8* data();
u8 const* data() const;
Buffer m_buffer;
Mode m_mode { DEFAULT_MODE };
};
}

View file

@ -40,7 +40,7 @@ public:
auto byte_count = builder.length();
VERIFY(byte_count > MAX_SHORT_STRING_BYTE_COUNT);
auto buffer = builder.leak_buffer_for_string_construction({});
auto buffer = builder.leak_buffer_for_string_construction(Badge<StringData> {});
VERIFY(buffer.has_value()); // We should only arrive here if the buffer is outlined.
return adopt_ref(*new (buffer->buffer.data()) StringData(byte_count));

View file

@ -63,6 +63,11 @@ Utf16String Utf16String::from_utf32(Utf32View const& utf32_string)
return Utf16String { Detail::Utf16StringData::from_utf32(utf32_string) };
}
// Wraps the string data that Utf16StringData::from_string_builder() constructs from the builder. No validation of
// the builder's content happens here.
Utf16String Utf16String::from_string_builder_without_validation(StringBuilder& builder)
{
return Utf16String { Detail::Utf16StringData::from_string_builder(builder) };
}
ErrorOr<void> Formatter<Utf16String>::format(FormatBuilder& builder, Utf16String const& utf16_string)
{
if (utf16_string.has_long_utf16_storage())

View file

@ -81,11 +81,50 @@ public:
requires(IsOneOf<RemoveCVReference<T>, Utf16String>)
static Utf16String from_utf16_without_validation(T&&) = delete;
// Formats `parameters` according to `format` into a UTF-16 mode StringBuilder and returns the result as a
// Utf16String. A formatting failure is treated as fatal (MUST).
template<typename... Parameters>
ALWAYS_INLINE static Utf16String formatted(CheckedFormatString<Parameters...>&& format, Parameters const&... parameters)
{
StringBuilder builder(StringBuilder::Mode::UTF16);
VariadicFormatParams<AllowDebugOnlyFormatters::No, Parameters...> variadic_format_parameters { parameters... };
MUST(vformat(builder, format.view(), variadic_format_parameters));
return builder.to_utf16_string();
}
// Converts an arithmetic value to a Utf16String using the default "{}" formatting.
template<Arithmetic T>
ALWAYS_INLINE static Utf16String number(T value)
{
return formatted("{}", value);
}
// Joins the elements of `collection` with `separator` in a UTF-16 mode StringBuilder and returns the result.
// `format` is passed through to StringBuilder::join() and defaults to "{}".
template<class SeparatorType, class CollectionType>
ALWAYS_INLINE static Utf16String join(SeparatorType const& separator, CollectionType const& collection, StringView format = "{}"sv)
{
StringBuilder builder(StringBuilder::Mode::UTF16);
builder.join(separator, collection, format);
return builder.to_utf16_string();
}
// Entry point for StringBuilder (note the Badge parameter): verifies that the builder's content validates as UTF-16
// and then constructs the string from the builder. Invalid content is a fatal error (VERIFY), not a returned one.
ALWAYS_INLINE static Utf16String from_string_builder(Badge<StringBuilder>, StringBuilder& builder)
{
VERIFY(builder.utf16_string_view().validate());
return from_string_builder_without_validation(builder);
}
// Entry point for StringBuilder (note the Badge parameter): constructs the string from the builder without
// validating its content, by forwarding to the private overload.
ALWAYS_INLINE static Utf16String from_string_builder_without_validation(Badge<StringBuilder>, StringBuilder& builder)
{
return from_string_builder_without_validation(builder);
}
private:
ALWAYS_INLINE explicit Utf16String(NonnullRefPtr<Detail::Utf16StringData const> value)
: Utf16StringBase(move(value))
{
}
static Utf16String from_string_builder_without_validation(StringBuilder&);
};
template<>

View file

@ -132,6 +132,32 @@ NonnullRefPtr<Utf16StringData> Utf16StringData::from_utf32(Utf32View const& utf3
return string.release_nonnull();
}
// Takes over the builder's outlined buffer and constructs the Utf16StringData in place at the start of that buffer,
// with the code units already sitting at offset_of_string_storage(). The builder's buffer must be outlined; this is
// verified. If every code unit is ASCII, the code units are narrowed to single bytes in place and the string is
// marked as having ASCII storage.
NonnullRefPtr<Utf16StringData> Utf16StringData::from_string_builder(StringBuilder& builder)
{
// Read the length before leaking the buffer.
auto code_unit_length = builder.utf16_string_view().length_in_code_units();
// Due to internal optimizations, we have an explicit maximum string length of 2**63 - 1.
VERIFY(code_unit_length >> Detail::UTF16_FLAG == 0);
auto buffer = builder.leak_buffer_for_string_construction(Badge<Utf16StringData> {});
VERIFY(buffer.has_value()); // We should only arrive here if the buffer is outlined.
// `data` covers the code unit bytes that follow the header area; `view` reads the same bytes as code units.
auto data = buffer->buffer.slice(offset_of_string_storage(), code_unit_length * 2);
Utf16View view { reinterpret_cast<char16_t const*>(data.data()), data.size() / sizeof(char16_t) };
auto storage_type = view.is_ascii() ? StorageType::ASCII : StorageType::UTF16;
// FIXME: To reduce memory consumption, it would be better for StringBuilder to handle ASCII vs. UTF-16 storage. For
// example, it might store its buffer as ASCII until it comes across a non-ASCII code point, then switch to
// UTF-16. For now, we switch to ASCII here since third-party APIs will often want ASCII text.
if (storage_type == StorageType::ASCII) {
// Narrow in place, front to back: byte i is written only after code unit i (bytes 2i and 2i + 1) was read, so
// no code unit is overwritten before it has been consumed.
for (size_t i = 0; i < code_unit_length; ++i)
data[i] = static_cast<u8>(view.code_unit_at(i));
}
// Construct the header at the very start of the leaked buffer.
return adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length });
}
size_t Utf16StringData::calculate_code_point_length() const
{
ASSERT(!has_ascii_storage());

View file

@ -31,9 +31,15 @@ public:
static NonnullRefPtr<Utf16StringData> from_utf8(StringView, AllowASCIIStorage);
static NonnullRefPtr<Utf16StringData> from_utf16(Utf16View const&);
static NonnullRefPtr<Utf16StringData> from_utf32(Utf32View const&);
static NonnullRefPtr<Utf16StringData> from_string_builder(StringBuilder&);
~Utf16StringData() = default;
// Returns the byte offset of the m_ascii_data member within Utf16StringData. StringBuilder uses this as the size of
// the prefix it keeps in front of its content in UTF-16 mode.
[[nodiscard]] static constexpr size_t offset_of_string_storage()
{
return offsetof(Utf16StringData, m_ascii_data);
}
void operator delete(void* ptr)
{
free(ptr);

View file

@ -1280,7 +1280,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
VERIFY(position <= string_length);
// 3. Let result be the empty String.
Utf16Data result;
StringBuilder result(StringBuilder::Mode::UTF16);
// 4. Let templateRemainder be replacementTemplate.
auto replace_template_string = TRY(replacement_template.to_utf16_string(vm));
@ -1451,7 +1451,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
auto ref_length = ref.length_in_code_units();
// k. Set result to the string-concatenation of result and refReplacement.
result.append(ref_replacement.utf16_span().data(), ref_replacement.length_in_code_units());
result.append(ref_replacement);
// j. Set templateRemainder to the substring of templateRemainder from refLength.
// NOTE: We do this step last because refReplacement may point to templateRemainder.
@ -1459,7 +1459,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
}
// 6. Return result.
return MUST(Utf16View { result }.to_utf8());
return MUST(result.utf16_string_view().to_utf8());
}
void DisposeCapability::visit_edges(GC::Cell::Visitor& visitor) const

View file

@ -87,12 +87,11 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
auto inserted_data_result = MUST(AK::utf8_to_utf16(data));
auto after_data = utf16_view.substring_view(offset + count);
Utf16Data full_data;
full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
full_data.append(before_data.utf16_span().data(), before_data.length_in_code_units());
full_data.extend(inserted_data_result.data);
full_data.append(after_data.utf16_span().data(), after_data.length_in_code_units());
Utf16View full_view { full_data };
StringBuilder full_data(StringBuilder::Mode::UTF16, before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
full_data.append(before_data);
full_data.append(inserted_data_result.data);
full_data.append(after_data);
auto full_view = full_data.utf16_string_view();
bool characters_are_the_same = utf16_view == full_view;
auto old_data = m_data;

View file

@ -235,6 +235,80 @@ TEST_CASE(from_utf32)
}
}
// Exercises Utf16String::formatted(), Utf16String::number() and Utf16String::join(), checking both the resulting
// text and the storage kind that the result ends up with.
TEST_CASE(formatted)
{
// A short ASCII result from formatted() is expected to use short ASCII storage.
{
auto string = Utf16String::formatted("{}", 42);
EXPECT(!string.is_empty());
EXPECT(string.is_ascii());
EXPECT(!string.has_long_ascii_storage());
EXPECT(string.has_short_ascii_storage());
EXPECT_EQ(string.length_in_code_units(), 2uz);
EXPECT_EQ(string.length_in_code_points(), 2uz);
EXPECT_EQ(string, u"42"sv);
}
// number() is expected to give the same result as formatting with "{}".
{
auto string = Utf16String::number(42);
EXPECT(!string.is_empty());
EXPECT(string.is_ascii());
EXPECT(!string.has_long_ascii_storage());
EXPECT(string.has_short_ascii_storage());
EXPECT_EQ(string.length_in_code_units(), 2uz);
EXPECT_EQ(string.length_in_code_points(), 2uz);
EXPECT_EQ(string, u"42"sv);
}
// Mixed UTF-8, UTF-16 and numeric arguments; each emoji takes two code units, hence 15 code units for 13 code points.
{
auto string = Utf16String::formatted("whf {} {} {}!", "😀"sv, Utf16View { u"🍕"sv }, 3.14);
EXPECT(!string.is_empty());
EXPECT(!string.is_ascii());
EXPECT(!string.has_long_ascii_storage());
EXPECT(!string.has_short_ascii_storage());
EXPECT_EQ(string.length_in_code_units(), 15uz);
EXPECT_EQ(string.length_in_code_points(), 13uz);
EXPECT_EQ(string, u"whf 😀 🍕 3.14!"sv);
}
// Joining ASCII-only segments: 6 * 26 characters plus 5 two-character separators, expected in long ASCII storage.
{
Array segments {
u"abcdefghijklmnopqrstuvwxyz"sv,
u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
u"abcdefghijklmnopqrstuvwxyz"sv,
u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
u"abcdefghijklmnopqrstuvwxyz"sv,
u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
};
auto string = Utf16String::join(u"--"sv, segments);
EXPECT(!string.is_empty());
EXPECT(string.is_ascii());
EXPECT(string.has_long_ascii_storage());
EXPECT(!string.has_short_ascii_storage());
EXPECT_EQ(string.length_in_code_units(), 166uz);
EXPECT_EQ(string.length_in_code_points(), 166uz);
EXPECT_EQ(string, u"abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ--abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ--abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv);
}
// Joining segments that include two emoji (one spelled as an explicit surrogate pair): the result is not ASCII.
{
Array segments {
u"abcdefghijklmnopqrstuvwxyz"sv,
u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
u"\xd83d\xde00"sv,
u"abcdefghijklmnopqrstuvwxyz"sv,
u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
u"🍕"sv,
u"abcdefghijklmnopqrstuvwxyz"sv,
u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
};
auto string = Utf16String::join(u"--"sv, segments);
EXPECT(!string.is_empty());
EXPECT(!string.is_ascii());
EXPECT(!string.has_long_ascii_storage());
EXPECT(!string.has_short_ascii_storage());
EXPECT_EQ(string.length_in_code_units(), 174uz);
EXPECT_EQ(string.length_in_code_points(), 172uz);
EXPECT_EQ(string, u"abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ--😀--abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ--🍕--abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv);
}
}
TEST_CASE(copy_operations)
{
auto test = [](Utf16String const& string1) {

View file

@ -7,10 +7,11 @@ get_property(CLANG_PLUGINS_COMPILE_OPTIONS_FOR_TESTS GLOBAL PROPERTY CLANG_PLUGI
list(APPEND CLANG_PLUGINS_COMPILE_OPTIONS_FOR_TESTS
-std=c++23
-Wno-user-defined-literals
-Wno-invalid-offsetof
-Wno-literal-range
-Wno-unknown-warning-option
-Wno-unqualified-std-cast-call
-Wno-user-defined-literals
)
# Ensure we always check for invalid function field types regardless of the value of ENABLE_CLANG_PLUGINS_INVALID_FUNCTION_MEMBERS