diff --git a/AK/StringBuilder.cpp b/AK/StringBuilder.cpp index 8b0eed5b44d..03e4f11f68b 100644 --- a/AK/StringBuilder.cpp +++ b/AK/StringBuilder.cpp @@ -98,6 +98,29 @@ inline ErrorOr StringBuilder::will_append(size_t size_in_bytes) return {}; } +ErrorOr StringBuilder::ensure_storage_is_utf16() +{ + if (!exchange(m_utf16_builder_is_ascii, false)) + return {}; + if (is_empty()) + return {}; + + auto ascii_length = this->length(); + TRY(m_buffer.try_resize(m_buffer.size() + ascii_length)); + + Bytes source { data(), ascii_length }; + Span target { reinterpret_cast(data()), ascii_length }; + + for (size_t i = ascii_length; i > 0; --i) { + auto index = i - 1; + + auto ch = static_cast(source[index]); + target.overwrite(index, &ch, sizeof(char16_t)); + } + + return {}; +} + size_t StringBuilder::length() const { return m_buffer.size() - string_builder_prefix_size(m_mode); @@ -122,16 +145,15 @@ ErrorOr StringBuilder::try_append(StringView string) if (string.is_empty()) return {}; - switch (m_mode) { - case StringBuilder::Mode::UTF8: + if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && string.is_ascii())) { TRY(will_append(string.length())); TRY(m_buffer.try_append(string.characters_without_null_termination(), string.length())); - break; - case StringBuilder::Mode::UTF16: + } else { + TRY(ensure_storage_is_utf16()); + TRY(will_append(string.length() * 2)); for (auto code_point : Utf8View { string }) TRY(try_append_code_point(code_point)); - break; } return {}; @@ -139,14 +161,12 @@ ErrorOr StringBuilder::try_append(StringView string) ErrorOr StringBuilder::try_append(char ch) { - switch (m_mode) { - case StringBuilder::Mode::UTF8: + if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(ch))) { TRY(will_append(1)); TRY(m_buffer.try_append(ch)); - break; - case StringBuilder::Mode::UTF16: + } else { + TRY(ensure_storage_is_utf16()); TRY(try_append_code_unit(ch)); - break; } return {}; @@ -154,14 +174,12 @@ ErrorOr StringBuilder::try_append(char ch) ErrorOr StringBuilder::try_append_code_unit(char16_t ch) { - switch (m_mode) { - case StringBuilder::Mode::UTF8: + if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(ch))) { TRY(try_append_code_point(ch)); - break; - case StringBuilder::Mode::UTF16: + } else { + TRY(ensure_storage_is_utf16()); TRY(will_append(2)); TRY(m_buffer.try_append(&ch, sizeof(ch))); - break; } return {}; @@ -169,9 +187,12 @@ ErrorOr StringBuilder::try_append_code_unit(char16_t ch) ErrorOr StringBuilder::try_append_repeated(char ch, size_t n) { - TRY(will_append(n * (m_mode == Mode::UTF8 ? 1 : 2))); + auto append_as_utf8 = m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(ch)); + TRY(will_append(n * (append_as_utf8 ? 1 : 2))); + for (size_t i = 0; i < n; ++i) TRY(try_append(ch)); + return {}; } @@ -180,7 +201,7 @@ ErrorOr StringBuilder::try_append_repeated(StringView string, size_t n) if (string.is_empty()) return {}; - if (m_mode == Mode::UTF8) { + if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && string.is_ascii())) { TRY(will_append(string.length() * n)); } else { auto utf16_length = simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length()); @@ -198,7 +219,7 @@ ErrorOr StringBuilder::try_append_repeated(Utf16View const& string, size_t if (string.is_empty()) return {}; - if (m_mode == Mode::UTF8) { + if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && string.is_ascii())) { if (string.has_ascii_storage()) { TRY(will_append(string.length_in_code_units() * n)); } else { @@ -299,16 +320,12 @@ ErrorOr StringBuilder::to_fly_string() const Utf16String StringBuilder::to_utf16_string() { VERIFY(m_mode == Mode::UTF16); - if (m_buffer.is_inline()) - return Utf16String::from_utf16(utf16_string_view()); return Utf16String::from_string_builder({}, *this); } Utf16String StringBuilder::to_utf16_string_without_validation() { VERIFY(m_mode == Mode::UTF16); - if (m_buffer.is_inline()) - return Utf16String::from_utf16_without_validation(utf16_string_view()); return Utf16String::from_string_builder_without_validation({}, *this); } @@ -333,6 +350,8 @@ Utf16View StringBuilder::utf16_string_view() const VERIFY(m_mode == Mode::UTF16); auto view = m_buffer.span().slice(string_builder_prefix_size(m_mode)); + if (m_utf16_builder_is_ascii) + return { reinterpret_cast(view.data()), view.size() }; return { reinterpret_cast(view.data()), view.size() / 2 }; } @@ -348,13 +367,12 @@ ErrorOr StringBuilder::try_append_code_point(u32 code_point) return {}; } - switch (m_mode) { - case Mode::UTF8: + if (m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(code_point))) { TRY(AK::UnicodeUtils::try_code_point_to_utf8(code_point, [this](char c) { return try_append(c); })); - break; - case Mode::UTF16: + } else { + TRY(ensure_storage_is_utf16()); + TRY(AK::UnicodeUtils::try_code_point_to_utf16(code_point, [this](char16_t c) { return m_buffer.try_append(&c, sizeof(c)); })); - break; } return {}; @@ -367,7 +385,10 @@ void StringBuilder::append_code_point(u32 code_point) return; } - if (m_mode == Mode::UTF16) { + auto append_as_utf8 = m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && is_ascii(code_point)); + + if (!append_as_utf8) { + MUST(ensure_storage_is_utf16()); (void)(will_append(2)); if (code_point < UnicodeUtils::FIRST_SUPPLEMENTARY_PLANE_CODE_POINT) { @@ -415,7 +436,10 @@ ErrorOr StringBuilder::try_append(Utf16View const& utf16_view) if (utf16_view.has_ascii_storage()) return try_append(utf16_view.bytes()); - if (m_mode == Mode::UTF16) { + auto append_as_utf8 = m_mode == Mode::UTF8 || (m_utf16_builder_is_ascii && utf16_view.is_ascii()); + + if (!append_as_utf8) { + TRY(ensure_storage_is_utf16()); TRY(will_append(utf16_view.length_in_code_units() * 2)); for (size_t i = 0; i < utf16_view.length_in_code_units(); ++i) diff --git a/AK/StringBuilder.h b/AK/StringBuilder.h index 21d80b698b9..f625acfc545 100644 --- a/AK/StringBuilder.h +++ b/AK/StringBuilder.h @@ -125,11 +125,14 @@ private: Optional leak_buffer_for_string_construction(); ErrorOr will_append(size_t); + ErrorOr ensure_storage_is_utf16(); + u8* data(); u8 const* data() const; Buffer m_buffer; Mode m_mode { DEFAULT_MODE }; + bool m_utf16_builder_is_ascii { true }; }; } diff --git a/AK/Utf16String.cpp b/AK/Utf16String.cpp index 1ae6405a5c2..a996d4cec91 100644 --- a/AK/Utf16String.cpp +++ b/AK/Utf16String.cpp @@ -86,6 +86,23 @@ Utf16String Utf16String::from_utf32(Utf32View const& utf32_string) return Utf16String { Detail::Utf16StringData::from_utf32(utf32_string) }; } +Utf16String Utf16String::from_string_builder_without_validation(StringBuilder& builder) +{ + auto view = builder.utf16_string_view(); + + if (view.length_in_code_units() <= Detail::MAX_SHORT_STRING_BYTE_COUNT && view.has_ascii_storage()) { + Utf16String string; + string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(view.length_in_code_units()); + + auto result = view.bytes().copy_to(string.m_value.short_ascii_string.storage); + VERIFY(result == view.length_in_code_units()); + + return string; + } + + return Utf16String { Detail::Utf16StringData::from_string_builder(builder) }; +} + ErrorOr Utf16String::from_ipc_stream(Stream& stream, size_t length_in_code_units, bool is_ascii) { if (is_ascii && length_in_code_units <= Detail::MAX_SHORT_STRING_BYTE_COUNT) { @@ -104,11 +121,6 @@ ErrorOr Utf16String::from_ipc_stream(Stream& stream, size_t length_ return Utf16String { TRY(Detail::Utf16StringData::from_ipc_stream(stream, length_in_code_units, is_ascii)) }; } -Utf16String Utf16String::from_string_builder_without_validation(StringBuilder& builder) -{ - return Utf16String { Detail::Utf16StringData::from_string_builder(builder) }; -} - Utf16String Utf16String::repeated(u32 code_point, size_t count) { if (count <= Detail::MAX_SHORT_STRING_BYTE_COUNT && AK::is_ascii(code_point)) { diff --git a/AK/Utf16StringData.cpp b/AK/Utf16StringData.cpp index e2d1102dee0..538d830d163 100644 --- a/AK/Utf16StringData.cpp +++ b/AK/Utf16StringData.cpp @@ -135,28 +135,29 @@ NonnullRefPtr Utf16StringData::from_utf32(Utf32View const& utf3 NonnullRefPtr Utf16StringData::from_string_builder(StringBuilder& builder) { - auto code_unit_length = builder.utf16_string_view().length_in_code_units(); + auto view = builder.utf16_string_view(); - // Due to internal optimizations, we have an explicit maximum string length of 2**63 - 1. - VERIFY(code_unit_length >> Detail::UTF16_FLAG == 0); + auto code_unit_length = view.length_in_code_units(); + VERIFY_UTF16_LENGTH(code_unit_length); - auto buffer = builder.leak_buffer_for_string_construction(Badge {}); - VERIFY(buffer.has_value()); // We should only arrive here if the buffer is outlined. + RefPtr string; - auto data = buffer->buffer.slice(offset_of_string_storage(), code_unit_length * 2); + if (auto buffer = builder.leak_buffer_for_string_construction(Badge {}); buffer.has_value()) { + auto storage_type = view.has_ascii_storage() ? StorageType::ASCII : StorageType::UTF16; + string = adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length }); + } else { + if (view.has_ascii_storage()) { + string = create_uninitialized(StorageType::ASCII, code_unit_length); + TypedTransfer::copy(string->m_ascii_data, view.ascii_span().data(), code_unit_length); + } else { + string = create_uninitialized(StorageType::UTF16, code_unit_length); + TypedTransfer::copy(string->m_utf16_data, view.utf16_span().data(), code_unit_length); - Utf16View view { reinterpret_cast(data.data()), data.size() / sizeof(char16_t) }; - auto storage_type = view.is_ascii() ? StorageType::ASCII : StorageType::UTF16; - - // FIXME: To reduce memory consumption, it would be better for StringBuilder to handle ASCII vs. UTF-16 storage. For - // example, it might store its buffer as ASCII until it comes across a non-ASCII code point, then switch to - // UTF-16. For now, we switch to ASCII here since third-party APIs will often want ASCII text. - if (storage_type == StorageType::ASCII) { - for (size_t i = 0; i < code_unit_length; ++i) - data[i] = static_cast(view.code_unit_at(i)); + string->m_length_in_code_points = view.m_length_in_code_points; + } } - return adopt_ref(*new (buffer->buffer.data()) Utf16StringData { storage_type, code_unit_length }); + return string.release_nonnull(); } ErrorOr> Utf16StringData::from_ipc_stream(Stream& stream, size_t length_in_code_units, bool is_ascii) diff --git a/AK/Utf16View.h b/AK/Utf16View.h index 7703334a607..857d3086872 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -658,6 +658,7 @@ public: } private: + friend StringBuilder; friend Detail::Utf16StringBase; friend Detail::Utf16StringData;