mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-08-27 04:37:22 +00:00
AK: Optimize the UTF-16 StringBuilder for ASCII storage
When we build a UTF-16 string, we currently always switch to the UTF-16 storage mode inside StringBuilder. Then when it comes time to create the string, we switch the storage to ASCII if possible (by shifting the underlying bytes up). Instead, let's start out with ASCII storage and then switch to UTF-16 storage once we see a non-ASCII code point. For most strings, this will avoid allocating 2x the memory and avoid many ASCII validation calls.
This commit is contained in:
parent
99d7e08dff
commit
36c7302178
Notes:
github-actions[bot]
2025-08-13 13:57:50 +00:00
Author: https://github.com/trflynn89
Commit: 36c7302178
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5762
5 changed files with 91 additions and 50 deletions
|
@ -98,6 +98,29 @@ inline ErrorOr<void> StringBuilder::will_append(size_t size_in_bytes)
|
|||
return {};
|
||||
}
|
||||
|
||||
// Widens the builder's storage in place from one byte per code unit (ASCII) to two bytes per code unit (UTF-16).
//
// Returns an error only if growing the buffer fails. In that case the builder is left as it was: its bytes are
// untouched and it is still flagged as ASCII, so the flag never disagrees with what the buffer actually holds.
ErrorOr<void> StringBuilder::ensure_storage_is_utf16()
{
    // Already widened by an earlier call; nothing to convert.
    if (!m_utf16_builder_is_ascii)
        return {};

    // With no stored bytes there is nothing to widen, so only the flag changes.
    if (is_empty()) {
        m_utf16_builder_is_ascii = false;
        return {};
    }

    auto ascii_length = this->length();

    // Every stored byte becomes a two-byte code unit, so the buffer grows by one byte per stored byte.
    TRY(m_buffer.try_resize(m_buffer.size() + ascii_length));

    // Commit to UTF-16 only once the resize has succeeded. Clearing the flag before a failed resize would leave
    // one-byte-per-code-unit data in a buffer that utf16_string_view() would then read as char16_t.
    m_utf16_builder_is_ascii = false;

    Bytes source { data(), ascii_length };
    Span<char16_t> target { reinterpret_cast<char16_t*>(data()), ascii_length };

    // Both spans start at data(), so walk from the last element down to the first: target element `index`
    // occupies bytes 2*index and 2*index+1, and writing it must not clobber a source byte that is still unread.
    for (size_t i = ascii_length; i > 0; --i) {
        auto index = i - 1;

        auto ch = static_cast<char16_t>(source[index]);
        target.overwrite(index, &ch, sizeof(char16_t));
    }

    return {};
}
|
||||
|
||||
size_t StringBuilder::length() const
|
||||
{
|
||||
return m_buffer.size() - string_builder_prefix_size(m_mode);
|
||||
|
@ -122,16 +145,15 @@ ErrorOr<void> StringBuilder::try_append(StringView string)
|
|||
if (string.is_empty())
|
||||
return {};
|
||||
|
||||
switch (m_mode) {
|
||||
case StringBuilder::Mode::UTF8:
|
||||
if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && string.is_ascii())) {
|
||||
TRY(will_append(string.length()));
|
||||
TRY(m_buffer.try_append(string.characters_without_null_termination(), string.length()));
|
||||
break;
|
||||
case StringBuilder::Mode::UTF16:
|
||||
} else {
|
||||
TRY(ensure_storage_is_utf16());
|
||||
|
||||
TRY(will_append(string.length() * 2));
|
||||
for (auto code_point : Utf8View { string })
|
||||
TRY(try_append_code_point(code_point));
|
||||
break;
|
||||
}
|
||||
|
||||
return {};
|
||||
|
@ -139,14 +161,12 @@ ErrorOr<void> StringBuilder::try_append(StringView string)
|
|||
|
||||
ErrorOr<void> StringBuilder::try_append(char ch)
|
||||
{
|
||||
switch (m_mode) {
|
||||
case StringBuilder::Mode::UTF8:
|
||||
if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(ch))) {
|
||||
TRY(will_append(1));
|
||||
TRY(m_buffer.try_append(ch));
|
||||
break;
|
||||
case StringBuilder::Mode::UTF16:
|
||||
} else {
|
||||
TRY(ensure_storage_is_utf16());
|
||||
TRY(try_append_code_unit(ch));
|
||||
break;
|
||||
}
|
||||
|
||||
return {};
|
||||
|
@ -154,14 +174,12 @@ ErrorOr<void> StringBuilder::try_append(char ch)
|
|||
|
||||
ErrorOr<void> StringBuilder::try_append_code_unit(char16_t ch)
|
||||
{
|
||||
switch (m_mode) {
|
||||
case StringBuilder::Mode::UTF8:
|
||||
if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(ch))) {
|
||||
TRY(try_append_code_point(ch));
|
||||
break;
|
||||
case StringBuilder::Mode::UTF16:
|
||||
} else {
|
||||
TRY(ensure_storage_is_utf16());
|
||||
TRY(will_append(2));
|
||||
TRY(m_buffer.try_append(&ch, sizeof(ch)));
|
||||
break;
|
||||
}
|
||||
|
||||
return {};
|
||||
|
@ -169,9 +187,12 @@ ErrorOr<void> StringBuilder::try_append_code_unit(char16_t ch)
|
|||
|
||||
ErrorOr<void> StringBuilder::try_append_repeated(char ch, size_t n)
|
||||
{
|
||||
TRY(will_append(n * (m_mode == Mode::UTF8 ? 1 : 2)));
|
||||
auto append_as_utf8 = m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(ch));
|
||||
TRY(will_append(n * (append_as_utf8 ? 1 : 2)));
|
||||
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
TRY(try_append(ch));
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
|
@ -180,7 +201,7 @@ ErrorOr<void> StringBuilder::try_append_repeated(StringView string, size_t n)
|
|||
if (string.is_empty())
|
||||
return {};
|
||||
|
||||
if (m_mode == Mode::UTF8) {
|
||||
if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && string.is_ascii())) {
|
||||
TRY(will_append(string.length() * n));
|
||||
} else {
|
||||
auto utf16_length = simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length());
|
||||
|
@ -198,7 +219,7 @@ ErrorOr<void> StringBuilder::try_append_repeated(Utf16View const& string, size_t
|
|||
if (string.is_empty())
|
||||
return {};
|
||||
|
||||
if (m_mode == Mode::UTF8) {
|
||||
if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && string.is_ascii())) {
|
||||
if (string.has_ascii_storage()) {
|
||||
TRY(will_append(string.length_in_code_units() * n));
|
||||
} else {
|
||||
|
@ -299,16 +320,12 @@ ErrorOr<FlyString> StringBuilder::to_fly_string() const
|
|||
Utf16String StringBuilder::to_utf16_string()
|
||||
{
|
||||
VERIFY(m_mode == Mode::UTF16);
|
||||
if (m_buffer.is_inline())
|
||||
return Utf16String::from_utf16(utf16_string_view());
|
||||
return Utf16String::from_string_builder({}, *this);
|
||||
}
|
||||
|
||||
Utf16String StringBuilder::to_utf16_string_without_validation()
|
||||
{
|
||||
VERIFY(m_mode == Mode::UTF16);
|
||||
if (m_buffer.is_inline())
|
||||
return Utf16String::from_utf16_without_validation(utf16_string_view());
|
||||
return Utf16String::from_string_builder_without_validation({}, *this);
|
||||
}
|
||||
|
||||
|
@ -333,6 +350,8 @@ Utf16View StringBuilder::utf16_string_view() const
|
|||
VERIFY(m_mode == Mode::UTF16);
|
||||
auto view = m_buffer.span().slice(string_builder_prefix_size(m_mode));
|
||||
|
||||
if (m_utf16_builder_is_ascii)
|
||||
return { reinterpret_cast<char const*>(view.data()), view.size() };
|
||||
return { reinterpret_cast<char16_t const*>(view.data()), view.size() / 2 };
|
||||
}
|
||||
|
||||
|
@ -348,13 +367,12 @@ ErrorOr<void> StringBuilder::try_append_code_point(u32 code_point)
|
|||
return {};
|
||||
}
|
||||
|
||||
switch (m_mode) {
|
||||
case Mode::UTF8:
|
||||
if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(code_point))) {
|
||||
TRY(AK::UnicodeUtils::try_code_point_to_utf8(code_point, [this](char c) { return try_append(c); }));
|
||||
break;
|
||||
case Mode::UTF16:
|
||||
} else {
|
||||
TRY(ensure_storage_is_utf16());
|
||||
|
||||
TRY(AK::UnicodeUtils::try_code_point_to_utf16(code_point, [this](char16_t c) { return m_buffer.try_append(&c, sizeof(c)); }));
|
||||
break;
|
||||
}
|
||||
|
||||
return {};
|
||||
|
@ -367,7 +385,10 @@ void StringBuilder::append_code_point(u32 code_point)
|
|||
return;
|
||||
}
|
||||
|
||||
if (m_mode == Mode::UTF16) {
|
||||
auto append_as_utf8 = m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(code_point));
|
||||
|
||||
if (!append_as_utf8) {
|
||||
MUST(ensure_storage_is_utf16());
|
||||
(void)(will_append(2));
|
||||
|
||||
if (code_point < UnicodeUtils::FIRST_SUPPLEMENTARY_PLANE_CODE_POINT) {
|
||||
|
@ -415,7 +436,10 @@ ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
|
|||
if (utf16_view.has_ascii_storage())
|
||||
return try_append(utf16_view.bytes());
|
||||
|
||||
if (m_mode == Mode::UTF16) {
|
||||
auto append_as_utf8 = m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && utf16_view.is_ascii());
|
||||
|
||||
if (!append_as_utf8) {
|
||||
TRY(ensure_storage_is_utf16());
|
||||
TRY(will_append(utf16_view.length_in_code_units() * 2));
|
||||
|
||||
for (size_t i = 0; i < utf16_view.length_in_code_units(); ++i)
|
||||
|
|
|
@ -125,11 +125,14 @@ private:
|
|||
Optional<Buffer::OutlineBuffer> leak_buffer_for_string_construction();
|
||||
|
||||
ErrorOr<void> will_append(size_t);
|
||||
ErrorOr<void> ensure_storage_is_utf16();
|
||||
|
||||
u8* data();
|
||||
u8 const* data() const;
|
||||
|
||||
Buffer m_buffer;
|
||||
Mode m_mode { DEFAULT_MODE };
|
||||
bool m_utf16_builder_is_ascii { true };
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -86,6 +86,23 @@ Utf16String Utf16String::from_utf32(Utf32View const& utf32_string)
|
|||
return Utf16String { Detail::Utf16StringData::from_utf32(utf32_string) };
|
||||
}
|
||||
|
||||
// Creates a Utf16String from the builder's current contents. This function performs no validation of the code
// units itself.
//
// Contents that are stored as ASCII and fit within Detail::MAX_SHORT_STRING_BYTE_COUNT are copied into the
// short-string representation; everything else is handed to Detail::Utf16StringData::from_string_builder.
Utf16String Utf16String::from_string_builder_without_validation(StringBuilder& builder)
{
    auto contents = builder.utf16_string_view();
    auto code_unit_count = contents.length_in_code_units();

    // Too long for the short-string representation, or not stored as ASCII: build heap-backed string data.
    if (code_unit_count > Detail::MAX_SHORT_STRING_BYTE_COUNT || !contents.has_ascii_storage())
        return Utf16String { Detail::Utf16StringData::from_string_builder(builder) };

    Utf16String short_string;
    short_string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(code_unit_count);

    // The copy must transfer every byte of the view into the short string's storage.
    auto copied = contents.bytes().copy_to(short_string.m_value.short_ascii_string.storage);
    VERIFY(copied == code_unit_count);

    return short_string;
}
|
||||
|
||||
ErrorOr<Utf16String> Utf16String::from_ipc_stream(Stream& stream, size_t length_in_code_units, bool is_ascii)
|
||||
{
|
||||
if (is_ascii && length_in_code_units <= Detail::MAX_SHORT_STRING_BYTE_COUNT) {
|
||||
|
@ -104,11 +121,6 @@ ErrorOr<Utf16String> Utf16String::from_ipc_stream(Stream& stream, size_t length_
|
|||
return Utf16String { TRY(Detail::Utf16StringData::from_ipc_stream(stream, length_in_code_units, is_ascii)) };
|
||||
}
|
||||
|
||||
Utf16String Utf16String::from_string_builder_without_validation(StringBuilder& builder)
|
||||
{
|
||||
return Utf16String { Detail::Utf16StringData::from_string_builder(builder) };
|
||||
}
|
||||
|
||||
Utf16String Utf16String::repeated(u32 code_point, size_t count)
|
||||
{
|
||||
if (count <= Detail::MAX_SHORT_STRING_BYTE_COUNT && AK::is_ascii(code_point)) {
|
||||
|
|
|
@ -135,28 +135,29 @@ NonnullRefPtr<Utf16StringData> Utf16StringData::from_utf32(Utf32View const& utf3
|
|||
|
||||
NonnullRefPtr<Utf16StringData> Utf16StringData::from_string_builder(StringBuilder& builder)
|
||||
{
|
||||
auto code_unit_length = builder.utf16_string_view().length_in_code_units();
|
||||
auto view = builder.utf16_string_view();
|
||||
|
||||
// Due to internal optimizations, we have an explicit maximum string length of 2**63 - 1.
|
||||
VERIFY(code_unit_length >> Detail::UTF16_FLAG == 0);
|
||||
auto code_unit_length = view.length_in_code_units();
|
||||
VERIFY_UTF16_LENGTH(code_unit_length);
|
||||
|
||||
auto buffer = builder.leak_buffer_for_string_construction(Badge<Utf16StringData> {});
|
||||
VERIFY(buffer.has_value()); // We should only arrive here if the buffer is outlined.
|
||||
RefPtr<Utf16StringData> string;
|
||||
|
||||
auto data = buffer->buffer.slice(offset_of_string_storage(), code_unit_length * 2);
|
||||
if (auto buffer = builder.leak_buffer_for_string_construction(Badge<Utf16StringData> {}); buffer.has_value()) {
|
||||
auto storage_type = view.has_ascii_storage() ? StorageType::ASCII : StorageType::UTF16;
|
||||
string = adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length });
|
||||
} else {
|
||||
if (view.has_ascii_storage()) {
|
||||
string = create_uninitialized(StorageType::ASCII, code_unit_length);
|
||||
TypedTransfer<char>::copy(string->m_ascii_data, view.ascii_span().data(), code_unit_length);
|
||||
} else {
|
||||
string = create_uninitialized(StorageType::UTF16, code_unit_length);
|
||||
TypedTransfer<char16_t>::copy(string->m_utf16_data, view.utf16_span().data(), code_unit_length);
|
||||
|
||||
Utf16View view { reinterpret_cast<char16_t const*>(data.data()), data.size() / sizeof(char16_t) };
|
||||
auto storage_type = view.is_ascii() ? StorageType::ASCII : StorageType::UTF16;
|
||||
|
||||
// FIXME: To reduce memory consumption, it would be better for StringBuilder to handle ASCII vs. UTF-16 storage. For
|
||||
// example, it might store its buffer as ASCII until it comes across a non-ASCII code point, then switch to
|
||||
// UTF-16. For now, we switch to ASCII here since third-party APIs will often want ASCII text.
|
||||
if (storage_type == StorageType::ASCII) {
|
||||
for (size_t i = 0; i < code_unit_length; ++i)
|
||||
data[i] = static_cast<u8>(view.code_unit_at(i));
|
||||
string->m_length_in_code_points = view.m_length_in_code_points;
|
||||
}
|
||||
}
|
||||
|
||||
return adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length });
|
||||
return string.release_nonnull();
|
||||
}
|
||||
|
||||
ErrorOr<NonnullRefPtr<Utf16StringData>> Utf16StringData::from_ipc_stream(Stream& stream, size_t length_in_code_units, bool is_ascii)
|
||||
|
|
|
@ -658,6 +658,7 @@ public:
|
|||
}
|
||||
|
||||
private:
|
||||
friend StringBuilder;
|
||||
friend Detail::Utf16StringBase;
|
||||
friend Detail::Utf16StringData;
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue