From 36c7302178aee1b47a653f37f561496327b67a23 Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Fri, 8 Aug 2025 15:41:31 -0400 Subject: [PATCH] AK: Optimize the UTF-16 StringBuilder for ASCII storage When we build a UTF-16 string, we currently always switch to the UTF-16 storage mode inside StringBuilder. Then when it comes time to create the string, we switch the storage to ASCII if possible (by shifting the underlying bytes up). Instead, let's start out with ASCII storage and then switch to UTF-16 storage once we see a non-ASCII code point. For most strings, this will avoid allocating 2x the memory, and avoids many ASCII validation calls. --- AK/StringBuilder.cpp | 82 +++++++++++++++++++++++++++--------------- AK/StringBuilder.h | 3 ++ AK/Utf16String.cpp | 22 +++++++++--- AK/Utf16StringData.cpp | 33 ++++++++--------- AK/Utf16View.h | 1 + 5 files changed, 91 insertions(+), 50 deletions(-) diff --git a/AK/StringBuilder.cpp b/AK/StringBuilder.cpp index 8b0eed5b44d..03e4f11f68b 100644 --- a/AK/StringBuilder.cpp +++ b/AK/StringBuilder.cpp @@ -98,6 +98,29 @@ inline ErrorOr StringBuilder::will_append(size_t size_in_bytes) return {}; } +ErrorOr StringBuilder::ensure_storage_is_utf16() +{ + if (!exchange(m_utf16_builder_is_ascii, false)) + return {}; + if (is_empty()) + return {}; + + auto ascii_length = this->length(); + TRY(m_buffer.try_resize(m_buffer.size() + ascii_length)); + + Bytes source { data(), ascii_length }; + Span target { reinterpret_cast(data()), ascii_length }; + + for (size_t i = ascii_length; i > 0; --i) { + auto index = i - 1; + + auto ch = static_cast(source[index]); + target.overwrite(index, &ch, sizeof(char16_t)); + } + + return {}; +} + size_t StringBuilder::length() const { return m_buffer.size() - string_builder_prefix_size(m_mode); @@ -122,16 +145,15 @@ ErrorOr StringBuilder::try_append(StringView string) if (string.is_empty()) return {}; - switch (m_mode) { - case StringBuilder::Mode::UTF8: + if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && string.is_ascii())) { TRY(will_append(string.length())); TRY(m_buffer.try_append(string.characters_without_null_termination(), string.length())); - break; - case StringBuilder::Mode::UTF16: + } else { + TRY(ensure_storage_is_utf16()); + TRY(will_append(string.length() * 2)); for (auto code_point : Utf8View { string }) TRY(try_append_code_point(code_point)); - break; } return {}; @@ -139,14 +161,12 @@ ErrorOr StringBuilder::try_append(StringView string) ErrorOr StringBuilder::try_append(char ch) { - switch (m_mode) { - case StringBuilder::Mode::UTF8: + if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(ch))) { TRY(will_append(1)); TRY(m_buffer.try_append(ch)); - break; - case StringBuilder::Mode::UTF16: + } else { + TRY(ensure_storage_is_utf16()); TRY(try_append_code_unit(ch)); - break; } return {}; @@ -154,14 +174,12 @@ ErrorOr StringBuilder::try_append(char ch) ErrorOr StringBuilder::try_append_code_unit(char16_t ch) { - switch (m_mode) { - case StringBuilder::Mode::UTF8: + if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(ch))) { TRY(try_append_code_point(ch)); - break; - case StringBuilder::Mode::UTF16: + } else { + TRY(ensure_storage_is_utf16()); TRY(will_append(2)); TRY(m_buffer.try_append(&ch, sizeof(ch))); - break; } return {}; @@ -169,9 +187,12 @@ ErrorOr StringBuilder::try_append_code_unit(char16_t ch) ErrorOr StringBuilder::try_append_repeated(char ch, size_t n) { - TRY(will_append(n * (m_mode == Mode::UTF8 ? 1 : 2))); + auto append_as_utf8 = m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(ch)); + TRY(will_append(n * (append_as_utf8 ? 1 : 2))); + for (size_t i = 0; i < n; ++i) TRY(try_append(ch)); + return {}; } @@ -180,7 +201,7 @@ ErrorOr StringBuilder::try_append_repeated(StringView string, size_t n) if (string.is_empty()) return {}; - if (m_mode == Mode::UTF8) { + if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && string.is_ascii())) { TRY(will_append(string.length() * n)); } else { auto utf16_length = simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length()); @@ -198,7 +219,7 @@ ErrorOr StringBuilder::try_append_repeated(Utf16View const& string, size_t if (string.is_empty()) return {}; - if (m_mode == Mode::UTF8) { + if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && string.is_ascii())) { if (string.has_ascii_storage()) { TRY(will_append(string.length_in_code_units() * n)); } else { @@ -299,16 +320,12 @@ ErrorOr StringBuilder::to_fly_string() const Utf16String StringBuilder::to_utf16_string() { VERIFY(m_mode == Mode::UTF16); - if (m_buffer.is_inline()) - return Utf16String::from_utf16(utf16_string_view()); return Utf16String::from_string_builder({}, *this); } Utf16String StringBuilder::to_utf16_string_without_validation() { VERIFY(m_mode == Mode::UTF16); - if (m_buffer.is_inline()) - return Utf16String::from_utf16_without_validation(utf16_string_view()); return Utf16String::from_string_builder_without_validation({}, *this); } @@ -333,6 +350,8 @@ Utf16View StringBuilder::utf16_string_view() const VERIFY(m_mode == Mode::UTF16); auto view = m_buffer.span().slice(string_builder_prefix_size(m_mode)); + if (m_utf16_builder_is_ascii) + return { reinterpret_cast(view.data()), view.size() }; return { reinterpret_cast(view.data()), view.size() / 2 }; } @@ -348,13 +367,12 @@ ErrorOr StringBuilder::try_append_code_point(u32 code_point) return {}; } - switch (m_mode) { - case Mode::UTF8: + if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(code_point))) { TRY(AK::UnicodeUtils::try_code_point_to_utf8(code_point, [this](char c) { return try_append(c); })); - break; - case Mode::UTF16: + } else { + TRY(ensure_storage_is_utf16()); + TRY(AK::UnicodeUtils::try_code_point_to_utf16(code_point, [this](char16_t c) { return m_buffer.try_append(&c, sizeof(c)); })); - break; } return {}; @@ -367,7 +385,10 @@ void StringBuilder::append_code_point(u32 code_point) return; } - if (m_mode == Mode::UTF16) { + auto append_as_utf8 = m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(code_point)); + + if (!append_as_utf8) { + MUST(ensure_storage_is_utf16()); (void)(will_append(2)); if (code_point < UnicodeUtils::FIRST_SUPPLEMENTARY_PLANE_CODE_POINT) { @@ -415,7 +436,10 @@ ErrorOr StringBuilder::try_append(Utf16View const& utf16_view) if (utf16_view.has_ascii_storage()) return try_append(utf16_view.bytes()); - if (m_mode == Mode::UTF16) { + auto append_as_utf8 = m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && utf16_view.is_ascii()); + + if (!append_as_utf8) { + TRY(ensure_storage_is_utf16()); TRY(will_append(utf16_view.length_in_code_units() * 2)); for (size_t i = 0; i < utf16_view.length_in_code_units(); ++i) diff --git a/AK/StringBuilder.h b/AK/StringBuilder.h index 21d80b698b9..f625acfc545 100644 --- a/AK/StringBuilder.h +++ b/AK/StringBuilder.h @@ -125,11 +125,14 @@ private: Optional leak_buffer_for_string_construction(); ErrorOr will_append(size_t); + ErrorOr ensure_storage_is_utf16(); + u8* data(); u8 const* data() const; Buffer m_buffer; Mode m_mode { DEFAULT_MODE }; + bool m_utf16_builder_is_ascii { true }; }; } diff --git a/AK/Utf16String.cpp b/AK/Utf16String.cpp index 1ae6405a5c2..a996d4cec91 100644 --- a/AK/Utf16String.cpp +++ b/AK/Utf16String.cpp @@ -86,6 +86,23 @@ Utf16String Utf16String::from_utf32(Utf32View const& utf32_string) return Utf16String { Detail::Utf16StringData::from_utf32(utf32_string) }; } +Utf16String Utf16String::from_string_builder_without_validation(StringBuilder& builder) +{ + auto view = builder.utf16_string_view(); + + if (view.length_in_code_units() <= Detail::MAX_SHORT_STRING_BYTE_COUNT && view.has_ascii_storage()) { + Utf16String string; + string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(view.length_in_code_units()); + + auto result = view.bytes().copy_to(string.m_value.short_ascii_string.storage); + VERIFY(result == view.length_in_code_units()); + + return string; + } + + return Utf16String { Detail::Utf16StringData::from_string_builder(builder) }; +} + ErrorOr Utf16String::from_ipc_stream(Stream& stream, size_t length_in_code_units, bool is_ascii) { if (is_ascii && length_in_code_units <= Detail::MAX_SHORT_STRING_BYTE_COUNT) { @@ -104,11 +121,6 @@ ErrorOr Utf16String::from_ipc_stream(Stream& stream, size_t length_ return Utf16String { TRY(Detail::Utf16StringData::from_ipc_stream(stream, length_in_code_units, is_ascii)) }; } -Utf16String Utf16String::from_string_builder_without_validation(StringBuilder& builder) -{ - return Utf16String { Detail::Utf16StringData::from_string_builder(builder) }; -} - Utf16String Utf16String::repeated(u32 code_point, size_t count) { if (count <= Detail::MAX_SHORT_STRING_BYTE_COUNT && AK::is_ascii(code_point)) { diff --git a/AK/Utf16StringData.cpp b/AK/Utf16StringData.cpp index e2d1102dee0..538d830d163 100644 --- a/AK/Utf16StringData.cpp +++ b/AK/Utf16StringData.cpp @@ -135,28 +135,29 @@ NonnullRefPtr Utf16StringData::from_utf32(Utf32View const& utf3 NonnullRefPtr Utf16StringData::from_string_builder(StringBuilder& builder) { - auto code_unit_length = builder.utf16_string_view().length_in_code_units(); + auto view = builder.utf16_string_view(); - // Due to internal optimizations, we have an explicit maximum string length of 2**63 - 1. - VERIFY(code_unit_length >> Detail::UTF16_FLAG == 0); + auto code_unit_length = view.length_in_code_units(); + VERIFY_UTF16_LENGTH(code_unit_length); - auto buffer = builder.leak_buffer_for_string_construction(Badge {}); - VERIFY(buffer.has_value()); // We should only arrive here if the buffer is outlined. + RefPtr string; - auto data = buffer->buffer.slice(offset_of_string_storage(), code_unit_length * 2); + if (auto buffer = builder.leak_buffer_for_string_construction(Badge {}); buffer.has_value()) { + auto storage_type = view.has_ascii_storage() ? StorageType::ASCII : StorageType::UTF16; + string = adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length }); + } else { + if (view.has_ascii_storage()) { + string = create_uninitialized(StorageType::ASCII, code_unit_length); + TypedTransfer::copy(string->m_ascii_data, view.ascii_span().data(), code_unit_length); + } else { + string = create_uninitialized(StorageType::UTF16, code_unit_length); + TypedTransfer::copy(string->m_utf16_data, view.utf16_span().data(), code_unit_length); - Utf16View view { reinterpret_cast(data.data()), data.size() / sizeof(char16_t) }; - auto storage_type = view.is_ascii() ? StorageType::ASCII : StorageType::UTF16; - - // FIXME: To reduce memory consumption, it would be better for StringBuilder to handle ASCII vs. UTF-16 storage. For - // example, it might store its buffer as ASCII until it comes across a non-ASCII code point, then switch to - // UTF-16. For now, we switch to ASCII here since third-party APIs will often want ASCII text. - if (storage_type == StorageType::ASCII) { - for (size_t i = 0; i < code_unit_length; ++i) - data[i] = static_cast(view.code_unit_at(i)); + string->m_length_in_code_points = view.m_length_in_code_points; + } } - return adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length }); + return string.release_nonnull(); } ErrorOr> Utf16StringData::from_ipc_stream(Stream& stream, size_t length_in_code_units, bool is_ascii) diff --git a/AK/Utf16View.h b/AK/Utf16View.h index 7703334a607..857d3086872 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -658,6 +658,7 @@ public: } private: + friend StringBuilder; friend Detail::Utf16StringBase; friend Detail::Utf16StringData;