AK: Optimize the UTF-16 StringBuilder for ASCII storage

When we build a UTF-16 string, we currently always switch to the UTF-16 storage mode inside StringBuilder. Then, when it comes time to create the string, we switch the storage to ASCII if possible (by shifting the underlying bytes up). Instead, let's start out with ASCII storage and switch to UTF-16 storage only once we see a non-ASCII code point. For most strings, this avoids allocating 2x the memory and avoids many ASCII validation calls.
Author: https://github.com/trflynn89 Commit: 36c7302178 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5762
2025-10-27 10:29:23 +00:00 · 2025-08-08 15:41:31 -04:00 · 2025-08-08 15:41:31 -04:00 · 36c7302178 · 2025-08-13 13:57:50 +00:00
commit 36c7302178
parent 99d7e08dff
5 changed files with 91 additions and 50 deletions
--- a/AK/Utf16StringData.cpp
+++ b/AK/Utf16StringData.cpp
@@ -135,28 +135,29 @@ NonnullRefPtr<Utf16StringData> Utf16StringData::from_utf32(Utf32View const& utf3

 NonnullRefPtr<Utf16StringData> Utf16StringData::from_string_builder(StringBuilder& builder)
 {
-    auto code_unit_length = builder.utf16_string_view().length_in_code_units();
+    auto view = builder.utf16_string_view();

-    // Due to internal optimizations, we have an explicit maximum string length of 2**63 - 1.
-    VERIFY(code_unit_length >> Detail::UTF16_FLAG == 0);
+    auto code_unit_length = view.length_in_code_units();
+    VERIFY_UTF16_LENGTH(code_unit_length);

-    auto buffer = builder.leak_buffer_for_string_construction(Badge<Utf16StringData> {});
-    VERIFY(buffer.has_value()); // We should only arrive here if the buffer is outlined.
+    RefPtr<Utf16StringData> string;

-    auto data = buffer->buffer.slice(offset_of_string_storage(), code_unit_length * 2);
+    if (auto buffer = builder.leak_buffer_for_string_construction(Badge<Utf16StringData> {}); buffer.has_value()) {
+        auto storage_type = view.has_ascii_storage() ? StorageType::ASCII : StorageType::UTF16;
+        string = adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length });
+    } else {
+        if (view.has_ascii_storage()) {
+            string = create_uninitialized(StorageType::ASCII, code_unit_length);
+            TypedTransfer<char>::copy(string->m_ascii_data, view.ascii_span().data(), code_unit_length);
+        } else {
+            string = create_uninitialized(StorageType::UTF16, code_unit_length);
+            TypedTransfer<char16_t>::copy(string->m_utf16_data, view.utf16_span().data(), code_unit_length);

-    Utf16View view { reinterpret_cast<char16_t const*>(data.data()), data.size() / sizeof(char16_t) };
-    auto storage_type = view.is_ascii() ? StorageType::ASCII : StorageType::UTF16;
-
-    // FIXME: To reduce memory consumption, it would be better for StringBuilder to handle ASCII vs. UTF-16 storage. For
-    //        example, it might store its buffer as ASCII until it comes across a non-ASCII code point, then switch to
-    //        UTF-16. For now, we switch to ASCII here since third-party APIs will often want ASCII text.
-    if (storage_type == StorageType::ASCII) {
-        for (size_t i = 0; i < code_unit_length; ++i)
-            data[i] = static_cast<u8>(view.code_unit_at(i));
+            string->m_length_in_code_points = view.m_length_in_code_points;
+        }
    }

-    return adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length });
+    return string.release_nonnull();
 }

 ErrorOr<NonnullRefPtr<Utf16StringData>> Utf16StringData::from_ipc_stream(Stream& stream, size_t length_in_code_units, bool is_ascii)