mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-10-27 10:29:23 +00:00
AK: Optimize the UTF-16 StringBuilder for ASCII storage
When we build a UTF-16 string, we currently always switch to the UTF-16 storage mode inside StringBuilder. Then, when it comes time to create the string, we switch the storage to ASCII if possible (by shifting the underlying bytes up). Instead, let's start out with ASCII storage and then switch to UTF-16 storage once we see a non-ASCII code point. For most strings, this avoids allocating 2x the memory and avoids many ASCII validation calls.
This commit is contained in:
parent
99d7e08dff
commit
36c7302178
Notes:
github-actions[bot]
2025-08-13 13:57:50 +00:00
Author: https://github.com/trflynn89
Commit: 36c7302178
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5762
5 changed files with 91 additions and 50 deletions
|
|
@ -135,28 +135,29 @@ NonnullRefPtr<Utf16StringData> Utf16StringData::from_utf32(Utf32View const& utf3
|
|||
|
||||
NonnullRefPtr<Utf16StringData> Utf16StringData::from_string_builder(StringBuilder& builder)
|
||||
{
|
||||
auto code_unit_length = builder.utf16_string_view().length_in_code_units();
|
||||
auto view = builder.utf16_string_view();
|
||||
|
||||
// Due to internal optimizations, we have an explicit maximum string length of 2**63 - 1.
|
||||
VERIFY(code_unit_length >> Detail::UTF16_FLAG == 0);
|
||||
auto code_unit_length = view.length_in_code_units();
|
||||
VERIFY_UTF16_LENGTH(code_unit_length);
|
||||
|
||||
auto buffer = builder.leak_buffer_for_string_construction(Badge<Utf16StringData> {});
|
||||
VERIFY(buffer.has_value()); // We should only arrive here if the buffer is outlined.
|
||||
RefPtr<Utf16StringData> string;
|
||||
|
||||
auto data = buffer->buffer.slice(offset_of_string_storage(), code_unit_length * 2);
|
||||
if (auto buffer = builder.leak_buffer_for_string_construction(Badge<Utf16StringData> {}); buffer.has_value()) {
|
||||
auto storage_type = view.has_ascii_storage() ? StorageType::ASCII : StorageType::UTF16;
|
||||
string = adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length });
|
||||
} else {
|
||||
if (view.has_ascii_storage()) {
|
||||
string = create_uninitialized(StorageType::ASCII, code_unit_length);
|
||||
TypedTransfer<char>::copy(string->m_ascii_data, view.ascii_span().data(), code_unit_length);
|
||||
} else {
|
||||
string = create_uninitialized(StorageType::UTF16, code_unit_length);
|
||||
TypedTransfer<char16_t>::copy(string->m_utf16_data, view.utf16_span().data(), code_unit_length);
|
||||
|
||||
Utf16View view { reinterpret_cast<char16_t const*>(data.data()), data.size() / sizeof(char16_t) };
|
||||
auto storage_type = view.is_ascii() ? StorageType::ASCII : StorageType::UTF16;
|
||||
|
||||
// FIXME: To reduce memory consumption, it would be better for StringBuilder to handle ASCII vs. UTF-16 storage. For
|
||||
// example, it might store its buffer as ASCII until it comes across a non-ASCII code point, then switch to
|
||||
// UTF-16. For now, we switch to ASCII here since third-party APIs will often want ASCII text.
|
||||
if (storage_type == StorageType::ASCII) {
|
||||
for (size_t i = 0; i < code_unit_length; ++i)
|
||||
data[i] = static_cast<u8>(view.code_unit_at(i));
|
||||
string->m_length_in_code_points = view.m_length_in_code_points;
|
||||
}
|
||||
}
|
||||
|
||||
return adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length });
|
||||
return string.release_nonnull();
|
||||
}
|
||||
|
||||
ErrorOr<NonnullRefPtr<Utf16StringData>> Utf16StringData::from_ipc_stream(Stream& stream, size_t length_in_code_units, bool is_ascii)
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue