mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-07-23 17:33:12 +00:00
The underlying storage used during string formatting is StringBuilder. To support UTF-16 strings, this patch allows callers to specify a mode during StringBuilder construction. The default mode is UTF-8, for which StringBuilder remains unchanged. In UTF-16 mode, we treat the StringBuilder's internal ByteBuffer as a series of u16 code units. Appending a single character will append 2 bytes for that character (cast to a char16_t). Appending a StringView will transcode the string to UTF-16. Utf16String also gains the same memory optimization that we added for String, where we hand off the underlying buffer to Utf16String to avoid having to re-allocate. In the future, we may want to further optimize for ASCII strings. For example, we could defer committing to the u16-esque storage until we see a non-ASCII code point.
78 lines
2.9 KiB
C++
78 lines
2.9 KiB
C++
/*
|
|
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
|
|
*
|
|
* SPDX-License-Identifier: BSD-2-Clause
|
|
*/
|
|
|
|
#include <AK/Utf16String.h>
|
|
#include <AK/Utf32View.h>
|
|
|
|
#include <simdutf.h>
|
|
|
|
namespace AK {
|
|
|
|
// Compile-time check that a ShortString occupies exactly as many bytes as a pointer to Utf16StringData.
// NOTE(review): presumably required because both representations share the same storage in
// Utf16String's m_value - confirm against the header.
static_assert(sizeof(Detail::ShortString) == sizeof(Detail::Utf16StringData*));
|
|
|
|
// Creates a Utf16String from a UTF-8 StringView.
//
// ASCII input of at most MAX_SHORT_STRING_BYTE_COUNT bytes is copied byte-for-byte into the inline
// short-string storage. Any other input is handed to Utf16StringData::from_utf8 with ASCII storage allowed.
Utf16String Utf16String::from_utf8_without_validation(StringView utf8_string)
{
    auto const byte_count = utf8_string.length();

    // Guard: anything too long or non-ASCII goes to the out-of-line representation.
    if (byte_count > Detail::MAX_SHORT_STRING_BYTE_COUNT || !utf8_string.is_ascii())
        return Utf16String { Detail::Utf16StringData::from_utf8(utf8_string, Detail::Utf16StringData::AllowASCIIStorage::Yes) };

    Utf16String short_string;
    short_string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(byte_count);

    // Every input byte must land in the inline buffer.
    auto copied = utf8_string.bytes().copy_to(short_string.m_value.short_ascii_string.storage);
    VERIFY(copied == byte_count);

    return short_string;
}
|
|
|
|
// Creates a Utf16String from a Utf16View.
//
// ASCII input of at most MAX_SHORT_STRING_BYTE_COUNT code units is stored inline, one byte per code
// unit. Any other input is handed to Utf16StringData::from_utf16.
Utf16String Utf16String::from_utf16_without_validation(Utf16View const& utf16_string)
{
    auto const code_unit_count = utf16_string.length_in_code_units();

    // Guard: anything too long or non-ASCII goes to the out-of-line representation.
    if (code_unit_count > Detail::MAX_SHORT_STRING_BYTE_COUNT || !utf16_string.is_ascii())
        return Utf16String { Detail::Utf16StringData::from_utf16(utf16_string) };

    Utf16String short_string;
    short_string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(code_unit_count);

    if (!utf16_string.has_ascii_storage()) {
        // The view holds 16-bit code units; transcode them into the inline byte buffer.
        auto converted = simdutf::convert_utf16_to_utf8(
            utf16_string.utf16_span().data(),
            code_unit_count,
            reinterpret_cast<char*>(short_string.m_value.short_ascii_string.storage));
        VERIFY(converted == code_unit_count);
    } else {
        // The view already holds ASCII bytes, so a plain copy is enough.
        auto copied = utf16_string.bytes().copy_to(short_string.m_value.short_ascii_string.storage);
        VERIFY(copied == code_unit_count);
    }

    return short_string;
}
|
|
|
|
// Creates a Utf16String from a Utf32View.
//
// ASCII input of at most MAX_SHORT_STRING_BYTE_COUNT code points is transcoded into the inline
// short-string storage. Any other input is handed to Utf16StringData::from_utf32.
Utf16String Utf16String::from_utf32(Utf32View const& utf32_string)
{
    auto const code_point_count = utf32_string.length();

    // Guard: anything too long or non-ASCII goes to the out-of-line representation.
    if (code_point_count > Detail::MAX_SHORT_STRING_BYTE_COUNT || !utf32_string.is_ascii())
        return Utf16String { Detail::Utf16StringData::from_utf32(utf32_string) };

    Utf16String short_string;
    short_string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(code_point_count);

    auto const* source = reinterpret_cast<char32_t const*>(utf32_string.code_points());
    auto* destination = reinterpret_cast<char*>(short_string.m_value.short_ascii_string.storage);

    // The number of bytes written must equal the number of code points read.
    auto written = simdutf::convert_utf32_to_utf8(source, code_point_count, destination);
    VERIFY(written == code_point_count);

    return short_string;
}
|
|
|
|
// Creates a Utf16String from the contents of a StringBuilder by delegating to
// Utf16StringData::from_string_builder. The builder is passed by mutable reference.
Utf16String Utf16String::from_string_builder_without_validation(StringBuilder& builder)
{
    auto string_data = Detail::Utf16StringData::from_string_builder(builder);
    return Utf16String { move(string_data) };
}
|
|
|
|
// Formats a Utf16String into the given FormatBuilder.
//
// Strings without long UTF-16 storage are emitted through FormatBuilder::put_string using their ASCII
// view. Strings with long UTF-16 storage are appended directly to the underlying StringBuilder.
// NOTE(review): the direct-append path does not go through put_string, so any handling put_string
// performs is presumably skipped for those strings - confirm whether that is intended.
ErrorOr<void> Formatter<Utf16String>::format(FormatBuilder& builder, Utf16String const& utf16_string)
{
    if (!utf16_string.has_long_utf16_storage())
        return builder.put_string(utf16_string.ascii_view());

    auto& string_builder = builder.builder();
    return string_builder.try_append(utf16_string.utf16_view());
}
|
|
|
|
}
|