mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-07-24 18:02:20 +00:00
AK: Support UTF-16 string formatting
The underlying storage used during string formatting is StringBuilder. To support UTF-16 strings, this patch allows callers to specify a mode during StringBuilder construction. The default mode is UTF-8, for which StringBuilder remains unchanged. In UTF-16 mode, we treat the StringBuilder's internal ByteBuffer as a series of u16 code units. Appending a single character will append 2 bytes for that character (cast to a char16_t). Appending a StringView will transcode the string to UTF-16. Utf16String also gains the same memory optimization that we added for String, where we hand off the underlying buffer to Utf16String to avoid having to re-allocate. In the future, we may want to further optimize for ASCII strings. For example, we could defer committing to the u16-esque storage until we see a non-ASCII code point.
This commit is contained in:
parent
fe676585f5
commit
2803d66d87
Notes:
github-actions[bot]
2025-07-18 16:47:24 +00:00
Author: https://github.com/trflynn89
Commit: 2803d66d87
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5388
Reviewed-by: https://github.com/shannonbooth ✅
11 changed files with 362 additions and 55 deletions
|
@ -14,6 +14,8 @@
|
|||
#include <AK/StringData.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/UnicodeUtils.h>
|
||||
#include <AK/Utf16String.h>
|
||||
#include <AK/Utf16StringData.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf32View.h>
|
||||
|
||||
|
@ -21,45 +23,70 @@
|
|||
|
||||
namespace AK {
|
||||
|
||||
static constexpr auto STRING_BASE_PREFIX_SIZE = sizeof(Detail::StringData);
|
||||
// Returns the number of bytes a StringBuilder keeps at the front of its buffer, ahead of the text, for the given
// mode. The prefix is sized after the string data type that the buffer may later be handed to.
static constexpr size_t string_builder_prefix_size(StringBuilder::Mode mode)
{
    if (mode == StringBuilder::Mode::UTF8)
        return sizeof(Detail::StringData);
    if (mode == StringBuilder::Mode::UTF16)
        return Detail::Utf16StringData::offset_of_string_storage();

    VERIFY_NOT_REACHED();
}
|
||||
|
||||
// Creates the backing buffer for a builder in the given mode. The buffer starts out holding only the mode's
// prefix; heap capacity is reserved up front only when the requested capacity exceeds the inline capacity.
static ErrorOr<StringBuilder::Buffer> create_buffer(StringBuilder::Mode mode, size_t capacity)
{
    auto const header_size = string_builder_prefix_size(mode);

    StringBuilder::Buffer buffer;
    if (capacity > StringBuilder::inline_capacity)
        TRY(buffer.try_ensure_capacity(header_size + capacity));
    TRY(buffer.try_resize(header_size));

    return buffer;
}
|
||||
|
||||
// Fallible factory: builds a default-mode StringBuilder, propagating any allocation failure to the caller.
ErrorOr<StringBuilder> StringBuilder::create(size_t initial_capacity)
{
    return StringBuilder { TRY(create_buffer(DEFAULT_MODE, initial_capacity)), DEFAULT_MODE };
}
|
||||
|
||||
// Default construction: a default-mode builder whose buffer holds just the reserved prefix.
StringBuilder::StringBuilder()
{
    constexpr auto prefix_size = string_builder_prefix_size(DEFAULT_MODE);

    // The prefix has to fit within the inline storage with room to spare.
    static_assert(inline_capacity > prefix_size);

    m_buffer.resize(prefix_size);
}
|
||||
|
||||
// Constructs a default-mode builder with room for `initial_capacity` bytes of text. Allocation failure is fatal
// here (MUST); use StringBuilder::create() for a fallible variant.
StringBuilder::StringBuilder(size_t initial_capacity)
    : m_buffer(MUST(create_buffer(DEFAULT_MODE, initial_capacity)))
{
}
|
||||
|
||||
StringBuilder::StringBuilder(Buffer buffer)
|
||||
// Constructs an empty builder in the given mode. Requesting exactly `inline_capacity` means create_buffer() does
// not reserve any heap capacity up front.
StringBuilder::StringBuilder(Mode mode)
    : m_buffer(MUST(create_buffer(mode, inline_capacity)))
    , m_mode(mode)
{
}
|
||||
|
||||
// Constructs a builder in the given mode with room for `initial_capacity_in_code_units` code units. The capacity is
// converted to bytes: 1 byte per code unit in UTF-8 mode, 2 bytes per code unit otherwise.
StringBuilder::StringBuilder(Mode mode, size_t initial_capacity_in_code_units)
    : m_buffer(MUST(create_buffer(mode, initial_capacity_in_code_units * (mode == Mode::UTF8 ? sizeof(char) : sizeof(char16_t)))))
    , m_mode(mode)
{
}
|
||||
|
||||
// Private constructor: adopts an already prepared buffer together with the mode it was prepared for.
StringBuilder::StringBuilder(Buffer buffer, Mode mode)
    : m_buffer(move(buffer))
    , m_mode(mode)
{
}
|
||||
|
||||
inline ErrorOr<void> StringBuilder::will_append(size_t size)
|
||||
inline ErrorOr<void> StringBuilder::will_append(size_t size_in_bytes)
|
||||
{
|
||||
Checked<size_t> needed_capacity = m_buffer.size();
|
||||
needed_capacity += size;
|
||||
needed_capacity += size_in_bytes;
|
||||
VERIFY(!needed_capacity.has_overflow());
|
||||
// Prefer to completely use the existing capacity first
|
||||
if (needed_capacity <= m_buffer.capacity())
|
||||
|
@ -73,7 +100,7 @@ inline ErrorOr<void> StringBuilder::will_append(size_t size)
|
|||
|
||||
// Returns the size of the built text in bytes, i.e. the buffer size without the reserved prefix.
size_t StringBuilder::length() const
{
    auto const prefix_size = string_builder_prefix_size(m_mode);
    return m_buffer.size() - prefix_size;
}
|
||||
|
||||
bool StringBuilder::is_empty() const
|
||||
|
@ -83,6 +110,9 @@ bool StringBuilder::is_empty() const
|
|||
|
||||
// Removes up to `count` code units from the end of the built text. Trimming more than the builder holds simply
// empties it.
void StringBuilder::trim(size_t count)
{
    size_t const code_unit_size = m_mode == Mode::UTF16 ? 2 : 1;

    // Clamp in code units before scaling to bytes so the multiplication cannot overflow, and clamp against the
    // text length rather than the raw buffer size so an oversized count never cuts into the reserved prefix
    // (which would make length() underflow).
    auto const code_units_to_remove = min(length() / code_unit_size, count);

    m_buffer.resize(m_buffer.size() - (code_units_to_remove * code_unit_size));
}
|
||||
|
@ -91,21 +121,55 @@ ErrorOr<void> StringBuilder::try_append(StringView string)
|
|||
{
    // Appends a UTF-8 string view. UTF-8 builders copy the bytes verbatim; UTF-16 builders transcode the view one
    // code point at a time.
    if (string.is_empty())
        return {};

    if (m_mode == Mode::UTF8) {
        TRY(will_append(string.length()));
        TRY(m_buffer.try_append(string.characters_without_null_termination(), string.length()));
    } else if (m_mode == Mode::UTF16) {
        // Two bytes per input byte is an upper bound on the transcoded size.
        TRY(will_append(string.length() * 2));

        for (auto code_point : Utf8View { string })
            TRY(try_append_code_point(code_point));
    }

    return {};
}
|
||||
|
||||
// Appends a single character: one byte in UTF-8 mode, one 2-byte code unit in UTF-16 mode.
ErrorOr<void> StringBuilder::try_append(char ch)
{
    if (m_mode == Mode::UTF16)
        return try_append_code_unit(ch);

    if (m_mode == Mode::UTF8) {
        TRY(will_append(1));
        TRY(m_buffer.try_append(ch));
    }

    return {};
}
|
||||
|
||||
// Appends a single UTF-16 code unit. UTF-16 builders store its two bytes as-is; UTF-8 builders treat the value as a
// code point and encode it.
ErrorOr<void> StringBuilder::try_append_code_unit(char16_t ch)
{
    if (m_mode == Mode::UTF8)
        return try_append_code_point(ch);

    if (m_mode == Mode::UTF16) {
        TRY(will_append(sizeof(ch)));
        TRY(m_buffer.try_append(&ch, sizeof(ch)));
    }

    return {};
}
|
||||
|
||||
ErrorOr<void> StringBuilder::try_append_repeated(char ch, size_t n)
|
||||
{
|
||||
TRY(will_append(n));
|
||||
TRY(will_append(n * (m_mode == Mode::UTF8 ? 1 : 2)));
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
TRY(try_append(ch));
|
||||
return {};
|
||||
|
@ -115,7 +179,7 @@ ErrorOr<void> StringBuilder::try_append_repeated(StringView string, size_t n)
|
|||
{
    if (string.is_empty())
        return {};

    // Reserve space for all `n` repetitions at once. In UTF-16 mode, two bytes per input byte is an upper bound on
    // the transcoded size of each repetition. (Reserving for a single repetition only, as before, left the loop
    // below to re-grow the buffer as it went.)
    TRY(will_append(string.length() * n * (m_mode == Mode::UTF8 ? 1 : 2)));

    for (size_t i = 0; i < n; ++i)
        TRY(try_append(string));

    return {};
|
||||
|
@ -141,6 +205,11 @@ void StringBuilder::append(char ch)
|
|||
MUST(try_append(ch));
|
||||
}
|
||||
|
||||
void StringBuilder::append_code_unit(char16_t ch)
|
||||
{
|
||||
MUST(try_append_code_unit(ch));
|
||||
}
|
||||
|
||||
void StringBuilder::append_repeated(char ch, size_t n)
|
||||
{
|
||||
MUST(try_append_repeated(ch, n));
|
||||
|
@ -158,6 +227,7 @@ ErrorOr<ByteBuffer> StringBuilder::to_byte_buffer() const
|
|||
|
||||
ByteString StringBuilder::to_byte_string() const
|
||||
{
|
||||
VERIFY(m_mode == Mode::UTF8);
|
||||
if (is_empty())
|
||||
return ByteString::empty();
|
||||
return ByteString((char const*)data(), length());
|
||||
|
@ -165,6 +235,7 @@ ByteString StringBuilder::to_byte_string() const
|
|||
|
||||
ErrorOr<String> StringBuilder::to_string()
|
||||
{
|
||||
VERIFY(m_mode == Mode::UTF8);
|
||||
if (m_buffer.is_inline())
|
||||
return String::from_utf8(string_view());
|
||||
return String::from_string_builder({}, *this);
|
||||
|
@ -172,6 +243,7 @@ ErrorOr<String> StringBuilder::to_string()
|
|||
|
||||
String StringBuilder::to_string_without_validation()
|
||||
{
|
||||
VERIFY(m_mode == Mode::UTF8);
|
||||
if (m_buffer.is_inline())
|
||||
return String::from_utf8_without_validation(string_view().bytes());
|
||||
return String::from_string_builder_without_validation({}, *this);
|
||||
|
@ -179,47 +251,108 @@ String StringBuilder::to_string_without_validation()
|
|||
|
||||
// Builds a FlyString from the current contents without validating them as UTF-8. Only valid for UTF-8 builders.
FlyString StringBuilder::to_fly_string_without_validation() const
{
    VERIFY(m_mode == Mode::UTF8);

    auto const contents = string_view().bytes();
    return FlyString::from_utf8_without_validation(contents);
}
|
||||
|
||||
// Builds a FlyString from the current contents via FlyString::from_utf8(). Only valid for UTF-8 builders.
ErrorOr<FlyString> StringBuilder::to_fly_string() const
{
    VERIFY(m_mode == Mode::UTF8);

    auto const contents = string_view();
    return FlyString::from_utf8(contents);
}
|
||||
|
||||
// Produces a Utf16String from the built text. Only valid for UTF-16 builders.
Utf16String StringBuilder::to_utf16_string()
{
    VERIFY(m_mode == Mode::UTF16);

    // An outlined buffer is passed to Utf16String via the builder itself; inline contents go through a view.
    if (!m_buffer.is_inline())
        return Utf16String::from_string_builder({}, *this);

    return Utf16String::from_utf16(utf16_string_view());
}
|
||||
|
||||
// Same as to_utf16_string(), but uses the non-validating Utf16String factories. Only valid for UTF-16 builders.
Utf16String StringBuilder::to_utf16_string_without_validation()
{
    VERIFY(m_mode == Mode::UTF16);

    if (!m_buffer.is_inline())
        return Utf16String::from_string_builder_without_validation({}, *this);

    return Utf16String::from_utf16_without_validation(utf16_string_view());
}
|
||||
|
||||
// Returns a pointer to the first byte of the built text, just past the reserved prefix.
u8* StringBuilder::data()
{
    auto const prefix_size = string_builder_prefix_size(m_mode);
    return m_buffer.data() + prefix_size;
}
|
||||
|
||||
// Const overload: returns a pointer to the first byte of the built text, just past the reserved prefix.
u8 const* StringBuilder::data() const
{
    auto const prefix_size = string_builder_prefix_size(m_mode);
    return m_buffer.data() + prefix_size;
}
|
||||
|
||||
// Returns a view over the built text (everything after the reserved prefix). Only valid for UTF-8 builders.
StringView StringBuilder::string_view() const
{
    VERIFY(m_mode == Mode::UTF8);

    auto const prefix_size = string_builder_prefix_size(m_mode);
    return m_buffer.span().slice(prefix_size);
}
|
||||
|
||||
// Returns a UTF-16 view over the built text, reinterpreting the bytes after the reserved prefix as 2-byte code
// units. Only valid for UTF-16 builders.
Utf16View StringBuilder::utf16_string_view() const
{
    VERIFY(m_mode == Mode::UTF16);

    auto const bytes = m_buffer.span().slice(string_builder_prefix_size(m_mode));
    auto const* code_units = reinterpret_cast<char16_t const*>(bytes.data());
    auto const code_unit_count = bytes.size() / 2;

    return { code_units, code_unit_count };
}
|
||||
|
||||
void StringBuilder::clear()
|
||||
{
|
||||
m_buffer.resize(STRING_BASE_PREFIX_SIZE);
|
||||
m_buffer.resize(string_builder_prefix_size(m_mode));
|
||||
}
|
||||
|
||||
// Appends one code point, encoded according to the builder's mode. Values for which is_unicode() is false are
// replaced by UnicodeUtils::REPLACEMENT_CODE_POINT.
ErrorOr<void> StringBuilder::try_append_code_point(u32 code_point)
{
    if (!is_unicode(code_point))
        return try_append_code_point(UnicodeUtils::REPLACEMENT_CODE_POINT);

    if (m_mode == Mode::UTF8) {
        TRY(AK::UnicodeUtils::try_code_point_to_utf8(code_point, [this](char byte) { return try_append(byte); }));
    } else if (m_mode == Mode::UTF16) {
        TRY(AK::UnicodeUtils::try_code_point_to_utf16(code_point, [this](char16_t code_unit) { return m_buffer.try_append(&code_unit, sizeof(code_unit)); }));
    }

    return {};
}
|
||||
|
||||
// Infallibly appends one code point, encoded according to the builder's mode. Values for which is_unicode() is
// false are replaced by UnicodeUtils::REPLACEMENT_CODE_POINT.
void StringBuilder::append_code_point(u32 code_point)
{
    if (!is_unicode(code_point)) {
        append_code_point(UnicodeUtils::REPLACEMENT_CODE_POINT);
        return;
    }

    if (m_mode == Mode::UTF16) {
        // The result of the reservation is deliberately discarded, as in the UTF-8 branches of this function.
        (void)(will_append(2));

        // Only code points strictly below the first supplementary-plane code point fit in a single code unit.
        // That boundary value itself must take the surrogate-pair path below; casting it to char16_t would
        // truncate it to 0.
        if (code_point < UnicodeUtils::FIRST_SUPPLEMENTARY_PLANE_CODE_POINT) {
            auto code_unit = static_cast<char16_t>(code_point);
            m_buffer.append(&code_unit, sizeof(code_unit));
            return;
        }

        // Supplementary-plane code point: emit a high surrogate (upper 10 bits of the offset) followed by a low
        // surrogate (lower 10 bits of the offset).
        (void)(will_append(2));
        code_point -= UnicodeUtils::FIRST_SUPPLEMENTARY_PLANE_CODE_POINT;

        auto code_unit = static_cast<u16>(UnicodeUtils::HIGH_SURROGATE_MIN | (code_point >> 10));
        m_buffer.append(&code_unit, sizeof(code_unit));

        code_unit = static_cast<u16>(UnicodeUtils::LOW_SURROGATE_MIN | (code_point & 0x3ff));
        m_buffer.append(&code_unit, sizeof(code_unit));

        return;
    }
|
||||
|
||||
if (code_point <= 0x7f) {
|
||||
m_buffer.append(static_cast<char>(code_point));
|
||||
} else if (code_point <= 0x07ff) {
|
||||
|
@ -231,17 +364,12 @@ void StringBuilder::append_code_point(u32 code_point)
|
|||
m_buffer.append(static_cast<char>((((code_point >> 12) & 0x0f) | 0xe0)));
|
||||
m_buffer.append(static_cast<char>((((code_point >> 6) & 0x3f) | 0x80)));
|
||||
m_buffer.append(static_cast<char>((((code_point >> 0) & 0x3f) | 0x80)));
|
||||
} else if (code_point <= 0x10ffff) {
|
||||
} else {
|
||||
(void)will_append(4);
|
||||
m_buffer.append(static_cast<char>((((code_point >> 18) & 0x07) | 0xf0)));
|
||||
m_buffer.append(static_cast<char>((((code_point >> 12) & 0x3f) | 0x80)));
|
||||
m_buffer.append(static_cast<char>((((code_point >> 6) & 0x3f) | 0x80)));
|
||||
m_buffer.append(static_cast<char>((((code_point >> 0) & 0x3f) | 0x80)));
|
||||
} else {
|
||||
(void)will_append(3);
|
||||
m_buffer.append(0xef);
|
||||
m_buffer.append(0xbf);
|
||||
m_buffer.append(0xbd);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -252,6 +380,15 @@ ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
|
|||
if (utf16_view.has_ascii_storage())
|
||||
return try_append(utf16_view.bytes());
|
||||
|
||||
if (m_mode == Mode::UTF16) {
|
||||
TRY(will_append(utf16_view.length_in_code_units() * 2));
|
||||
|
||||
for (size_t i = 0; i < utf16_view.length_in_code_units(); ++i)
|
||||
TRY(try_append_code_unit(utf16_view.code_unit_at(i)));
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
auto remaining_view = utf16_view.utf16_span();
|
||||
auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(remaining_view);
|
||||
|
||||
|
@ -356,7 +493,7 @@ ErrorOr<void> StringBuilder::try_append_escaped_for_json(StringView string)
|
|||
return {};
|
||||
}
|
||||
|
||||
auto StringBuilder::leak_buffer_for_string_construction(Badge<Detail::StringData>) -> Optional<Buffer::OutlineBuffer>
|
||||
auto StringBuilder::leak_buffer_for_string_construction() -> Optional<Buffer::OutlineBuffer>
|
||||
{
|
||||
if (auto buffer = m_buffer.leak_outline_buffer({}); buffer.has_value()) {
|
||||
clear();
|
||||
|
|
|
@ -16,6 +16,12 @@ namespace AK {
|
|||
|
||||
class StringBuilder {
|
||||
public:
|
||||
// Selects how the builder stores appended text in its internal buffer.
enum class Mode {
    // Text is stored as UTF-8 bytes.
    UTF8,
    // Text is stored as a series of 2-byte UTF-16 code units.
    UTF16,
};
|
||||
|
||||
static constexpr auto DEFAULT_MODE = Mode::UTF8;
|
||||
static constexpr size_t inline_capacity = 256;
|
||||
|
||||
using Buffer = Detail::ByteBuffer<inline_capacity>;
|
||||
|
@ -24,36 +30,42 @@ public:
|
|||
|
||||
StringBuilder();
|
||||
explicit StringBuilder(size_t initial_capacity);
|
||||
|
||||
explicit StringBuilder(Mode);
|
||||
StringBuilder(Mode, size_t initial_capacity_in_code_units);
|
||||
|
||||
~StringBuilder() = default;
|
||||
|
||||
ErrorOr<void> try_append(StringView);
|
||||
ErrorOr<void> try_append(Utf16View const&);
|
||||
ErrorOr<void> try_append(Utf32View const&);
|
||||
ErrorOr<void> try_append_code_point(u32);
|
||||
ErrorOr<void> try_append(char);
|
||||
ErrorOr<void> try_append_code_unit(char16_t);
|
||||
ErrorOr<void> try_append_code_point(u32);
|
||||
ErrorOr<void> try_append(char const*, size_t);
|
||||
ErrorOr<void> try_append_repeated(char, size_t);
|
||||
ErrorOr<void> try_append_repeated(StringView, size_t);
|
||||
ErrorOr<void> try_append_escaped_for_json(StringView);
|
||||
|
||||
template<typename... Parameters>
|
||||
ErrorOr<void> try_appendff(CheckedFormatString<Parameters...>&& fmtstr, Parameters const&... parameters)
|
||||
{
|
||||
VariadicFormatParams<AllowDebugOnlyFormatters::No, Parameters...> variadic_format_params { parameters... };
|
||||
return vformat(*this, fmtstr.view(), variadic_format_params);
|
||||
}
|
||||
ErrorOr<void> try_append(char const*, size_t);
|
||||
ErrorOr<void> try_append_repeated(char, size_t);
|
||||
ErrorOr<void> try_append_repeated(StringView, size_t);
|
||||
ErrorOr<void> try_append_escaped_for_json(StringView);
|
||||
|
||||
void append(StringView);
|
||||
void append(Utf16View const&);
|
||||
void append(Utf32View const&);
|
||||
void append(char);
|
||||
void append_code_unit(char16_t);
|
||||
void append_code_point(u32);
|
||||
void append(char const*, size_t);
|
||||
void appendvf(char const*, va_list);
|
||||
void append_repeated(char, size_t);
|
||||
void append_repeated(StringView, size_t);
|
||||
|
||||
void append_as_lowercase(char);
|
||||
void append_escaped_for_json(StringView);
|
||||
void append_as_lowercase(char);
|
||||
|
||||
template<typename... Parameters>
|
||||
void appendff(CheckedFormatString<Parameters...>&& fmtstr, Parameters const&... parameters)
|
||||
|
@ -70,9 +82,13 @@ public:
|
|||
[[nodiscard]] FlyString to_fly_string_without_validation() const;
|
||||
ErrorOr<FlyString> to_fly_string() const;
|
||||
|
||||
Utf16String to_utf16_string();
|
||||
Utf16String to_utf16_string_without_validation();
|
||||
|
||||
[[nodiscard]] ErrorOr<ByteBuffer> to_byte_buffer() const;
|
||||
|
||||
[[nodiscard]] StringView string_view() const;
|
||||
[[nodiscard]] Utf16View utf16_string_view() const;
|
||||
void clear();
|
||||
|
||||
[[nodiscard]] size_t length() const;
|
||||
|
@ -98,16 +114,20 @@ public:
|
|||
return {};
|
||||
}
|
||||
|
||||
Optional<Buffer::OutlineBuffer> leak_buffer_for_string_construction(Badge<Detail::StringData>);
|
||||
// Badge-gated entry point for Detail::StringData; forwards to the private implementation.
Optional<Buffer::OutlineBuffer> leak_buffer_for_string_construction(Badge<Detail::StringData>)
{
    return leak_buffer_for_string_construction();
}
|
||||
// Badge-gated entry point for Detail::Utf16StringData; forwards to the private implementation.
Optional<Buffer::OutlineBuffer> leak_buffer_for_string_construction(Badge<Detail::Utf16StringData>)
{
    return leak_buffer_for_string_construction();
}
|
||||
|
||||
private:
|
||||
explicit StringBuilder(Buffer);
|
||||
StringBuilder(Buffer, Mode);
|
||||
|
||||
Optional<Buffer::OutlineBuffer> leak_buffer_for_string_construction();
|
||||
|
||||
ErrorOr<void> will_append(size_t);
|
||||
u8* data();
|
||||
u8 const* data() const;
|
||||
|
||||
Buffer m_buffer;
|
||||
Mode m_mode { DEFAULT_MODE };
|
||||
};
|
||||
|
||||
}
|
||||
|
|
|
@ -40,7 +40,7 @@ public:
|
|||
auto byte_count = builder.length();
|
||||
VERIFY(byte_count > MAX_SHORT_STRING_BYTE_COUNT);
|
||||
|
||||
auto buffer = builder.leak_buffer_for_string_construction({});
|
||||
auto buffer = builder.leak_buffer_for_string_construction(Badge<StringData> {});
|
||||
VERIFY(buffer.has_value()); // We should only arrive here if the buffer is outlined.
|
||||
|
||||
return adopt_ref(*new (buffer->buffer.data()) StringData(byte_count));
|
||||
|
|
|
@ -63,6 +63,11 @@ Utf16String Utf16String::from_utf32(Utf32View const& utf32_string)
|
|||
return Utf16String { Detail::Utf16StringData::from_utf32(utf32_string) };
|
||||
}
|
||||
|
||||
// Wraps the string data that Utf16StringData builds from the given StringBuilder, without validating it.
Utf16String Utf16String::from_string_builder_without_validation(StringBuilder& builder)
{
    auto data = Detail::Utf16StringData::from_string_builder(builder);
    return Utf16String { move(data) };
}
|
||||
|
||||
ErrorOr<void> Formatter<Utf16String>::format(FormatBuilder& builder, Utf16String const& utf16_string)
|
||||
{
|
||||
if (utf16_string.has_long_utf16_storage())
|
||||
|
|
|
@ -81,11 +81,50 @@ public:
|
|||
requires(IsOneOf<RemoveCVReference<T>, Utf16String>)
|
||||
static Utf16String from_utf16_without_validation(T&&) = delete;
|
||||
|
||||
template<typename... Parameters>
|
||||
ALWAYS_INLINE static Utf16String formatted(CheckedFormatString<Parameters...>&& format, Parameters const&... parameters)
|
||||
{
|
||||
StringBuilder builder(StringBuilder::Mode::UTF16);
|
||||
|
||||
VariadicFormatParams<AllowDebugOnlyFormatters::No, Parameters...> variadic_format_parameters { parameters... };
|
||||
MUST(vformat(builder, format.view(), variadic_format_parameters));
|
||||
|
||||
return builder.to_utf16_string();
|
||||
}
|
||||
|
||||
template<Arithmetic T>
|
||||
ALWAYS_INLINE static Utf16String number(T value)
|
||||
{
|
||||
return formatted("{}", value);
|
||||
}
|
||||
|
||||
template<class SeparatorType, class CollectionType>
|
||||
ALWAYS_INLINE static Utf16String join(SeparatorType const& separator, CollectionType const& collection, StringView format = "{}"sv)
|
||||
{
|
||||
StringBuilder builder(StringBuilder::Mode::UTF16);
|
||||
builder.join(separator, collection, format);
|
||||
|
||||
return builder.to_utf16_string();
|
||||
}
|
||||
|
||||
// StringBuilder-only entry point: verifies that the builder's contents validate as UTF-16, then builds the string
// from the builder.
ALWAYS_INLINE static Utf16String from_string_builder(Badge<StringBuilder>, StringBuilder& builder)
{
    auto const contents = builder.utf16_string_view();
    VERIFY(contents.validate());

    return from_string_builder_without_validation(builder);
}
|
||||
|
||||
// StringBuilder-only entry point: builds the string from the builder without validating its contents. Forwards to
// the private overload that takes only the builder.
ALWAYS_INLINE static Utf16String from_string_builder_without_validation(Badge<StringBuilder>, StringBuilder& builder)
{
    return from_string_builder_without_validation(builder);
}
|
||||
|
||||
private:
|
||||
// Private constructor: wraps already constructed string data by passing it to the Utf16StringBase constructor.
ALWAYS_INLINE explicit Utf16String(NonnullRefPtr<Detail::Utf16StringData const> value)
    : Utf16StringBase(move(value))
{
}
|
||||
|
||||
static Utf16String from_string_builder_without_validation(StringBuilder&);
|
||||
};
|
||||
|
||||
template<>
|
||||
|
|
|
@ -132,6 +132,32 @@ NonnullRefPtr<Utf16StringData> Utf16StringData::from_utf32(Utf32View const& utf3
|
|||
return string.release_nonnull();
|
||||
}
|
||||
|
||||
// Takes over the outlined buffer of a UTF-16 mode StringBuilder and constructs the Utf16StringData in place at the
// start of that buffer. The builder's text sits at offset_of_string_storage() within the buffer.
NonnullRefPtr<Utf16StringData> Utf16StringData::from_string_builder(StringBuilder& builder)
{
    auto const length_in_code_units = builder.utf16_string_view().length_in_code_units();

    // Due to internal optimizations, we have an explicit maximum string length of 2**63 - 1.
    VERIFY(length_in_code_units >> Detail::UTF16_FLAG == 0);

    auto leaked = builder.leak_buffer_for_string_construction(Badge<Utf16StringData> {});
    VERIFY(leaked.has_value()); // We should only arrive here if the buffer is outlined.

    auto storage = leaked->buffer.slice(offset_of_string_storage(), length_in_code_units * 2);
    Utf16View code_units { reinterpret_cast<char16_t const*>(storage.data()), storage.size() / sizeof(char16_t) };

    auto const storage_type = code_units.is_ascii() ? StorageType::ASCII : StorageType::UTF16;

    // FIXME: To reduce memory consumption, it would be better for StringBuilder to handle ASCII vs. UTF-16 storage. For
    //        example, it might store its buffer as ASCII until it comes across a non-ASCII code point, then switch to
    //        UTF-16. For now, we switch to ASCII here since third-party APIs will often want ASCII text.
    if (storage_type == StorageType::ASCII) {
        // Narrow in place, front to back: byte `i` is written only after code unit `i` has been read.
        for (size_t i = 0; i < length_in_code_units; ++i)
            storage[i] = static_cast<u8>(code_units.code_unit_at(i));
    }

    return adopt_ref(*new (leaked->buffer.data()) Utf16StringData { storage_type, length_in_code_units });
}
|
||||
|
||||
size_t Utf16StringData::calculate_code_point_length() const
|
||||
{
|
||||
ASSERT(!has_ascii_storage());
|
||||
|
|
|
@ -31,9 +31,15 @@ public:
|
|||
static NonnullRefPtr<Utf16StringData> from_utf8(StringView, AllowASCIIStorage);
|
||||
static NonnullRefPtr<Utf16StringData> from_utf16(Utf16View const&);
|
||||
static NonnullRefPtr<Utf16StringData> from_utf32(Utf32View const&);
|
||||
static NonnullRefPtr<Utf16StringData> from_string_builder(StringBuilder&);
|
||||
|
||||
~Utf16StringData() = default;
|
||||
|
||||
// Byte offset of the m_ascii_data member within Utf16StringData.
[[nodiscard]] static constexpr size_t offset_of_string_storage()
{
    constexpr size_t storage_offset = offsetof(Utf16StringData, m_ascii_data);
    return storage_offset;
}
|
||||
|
||||
void operator delete(void* ptr)
|
||||
{
|
||||
free(ptr);
|
||||
|
|
|
@ -1280,7 +1280,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
|
|||
VERIFY(position <= string_length);
|
||||
|
||||
// 3. Let result be the empty String.
|
||||
Utf16Data result;
|
||||
StringBuilder result(StringBuilder::Mode::UTF16);
|
||||
|
||||
// 4. Let templateRemainder be replacementTemplate.
|
||||
auto replace_template_string = TRY(replacement_template.to_utf16_string(vm));
|
||||
|
@ -1451,7 +1451,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
|
|||
auto ref_length = ref.length_in_code_units();
|
||||
|
||||
// k. Set result to the string-concatenation of result and refReplacement.
|
||||
result.append(ref_replacement.utf16_span().data(), ref_replacement.length_in_code_units());
|
||||
result.append(ref_replacement);
|
||||
|
||||
// j. Set templateRemainder to the substring of templateRemainder from refLength.
|
||||
// NOTE: We do this step last because refReplacement may point to templateRemainder.
|
||||
|
@ -1459,7 +1459,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
|
|||
}
|
||||
|
||||
// 6. Return result.
|
||||
return MUST(Utf16View { result }.to_utf8());
|
||||
return MUST(result.utf16_string_view().to_utf8());
|
||||
}
|
||||
|
||||
void DisposeCapability::visit_edges(GC::Cell::Visitor& visitor) const
|
||||
|
|
|
@ -87,12 +87,11 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
|
|||
auto inserted_data_result = MUST(AK::utf8_to_utf16(data));
|
||||
auto after_data = utf16_view.substring_view(offset + count);
|
||||
|
||||
Utf16Data full_data;
|
||||
full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
|
||||
full_data.append(before_data.utf16_span().data(), before_data.length_in_code_units());
|
||||
full_data.extend(inserted_data_result.data);
|
||||
full_data.append(after_data.utf16_span().data(), after_data.length_in_code_units());
|
||||
Utf16View full_view { full_data };
|
||||
StringBuilder full_data(StringBuilder::Mode::UTF16, before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
|
||||
full_data.append(before_data);
|
||||
full_data.append(inserted_data_result.data);
|
||||
full_data.append(after_data);
|
||||
auto full_view = full_data.utf16_string_view();
|
||||
|
||||
bool characters_are_the_same = utf16_view == full_view;
|
||||
auto old_data = m_data;
|
||||
|
|
|
@ -235,6 +235,80 @@ TEST_CASE(from_utf32)
|
|||
}
|
||||
}
|
||||
|
||||
TEST_CASE(formatted)
{
    // Formatting an integer: short ASCII result.
    {
        auto formatted_number = Utf16String::formatted("{}", 42);
        EXPECT(!formatted_number.is_empty());
        EXPECT(formatted_number.is_ascii());
        EXPECT(!formatted_number.has_long_ascii_storage());
        EXPECT(formatted_number.has_short_ascii_storage());
        EXPECT_EQ(formatted_number.length_in_code_units(), 2uz);
        EXPECT_EQ(formatted_number.length_in_code_points(), 2uz);
        EXPECT_EQ(formatted_number, u"42"sv);
    }

    // Utf16String::number(): same expectations as formatting "{}" with the value.
    {
        auto number_string = Utf16String::number(42);
        EXPECT(!number_string.is_empty());
        EXPECT(number_string.is_ascii());
        EXPECT(!number_string.has_long_ascii_storage());
        EXPECT(number_string.has_short_ascii_storage());
        EXPECT_EQ(number_string.length_in_code_units(), 2uz);
        EXPECT_EQ(number_string.length_in_code_points(), 2uz);
        EXPECT_EQ(number_string, u"42"sv);
    }

    // Mixed UTF-8 and UTF-16 arguments containing non-ASCII code points.
    {
        auto mixed_string = Utf16String::formatted("whf {} {} {}!", "😀"sv, Utf16View { u"🍕"sv }, 3.14);
        EXPECT(!mixed_string.is_empty());
        EXPECT(!mixed_string.is_ascii());
        EXPECT(!mixed_string.has_long_ascii_storage());
        EXPECT(!mixed_string.has_short_ascii_storage());
        EXPECT_EQ(mixed_string.length_in_code_units(), 15uz);
        EXPECT_EQ(mixed_string.length_in_code_points(), 13uz);
        EXPECT_EQ(mixed_string, u"whf 😀 🍕 3.14!"sv);
    }

    // Joining ASCII-only segments: long ASCII result.
    {
        Array segments {
            u"abcdefghijklmnopqrstuvwxyz"sv,
            u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
            u"abcdefghijklmnopqrstuvwxyz"sv,
            u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
            u"abcdefghijklmnopqrstuvwxyz"sv,
            u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
        };

        auto joined = Utf16String::join(u"--"sv, segments);
        EXPECT(!joined.is_empty());
        EXPECT(joined.is_ascii());
        EXPECT(joined.has_long_ascii_storage());
        EXPECT(!joined.has_short_ascii_storage());
        EXPECT_EQ(joined.length_in_code_units(), 166uz);
        EXPECT_EQ(joined.length_in_code_points(), 166uz);
        EXPECT_EQ(joined, u"abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ--abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ--abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv);
    }

    // Joining segments that include surrogate pairs: non-ASCII result.
    {
        Array segments {
            u"abcdefghijklmnopqrstuvwxyz"sv,
            u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
            u"\xd83d\xde00"sv,
            u"abcdefghijklmnopqrstuvwxyz"sv,
            u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
            u"🍕"sv,
            u"abcdefghijklmnopqrstuvwxyz"sv,
            u"ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv,
        };

        auto joined = Utf16String::join(u"--"sv, segments);
        EXPECT(!joined.is_empty());
        EXPECT(!joined.is_ascii());
        EXPECT(!joined.has_long_ascii_storage());
        EXPECT(!joined.has_short_ascii_storage());
        EXPECT_EQ(joined.length_in_code_units(), 174uz);
        EXPECT_EQ(joined.length_in_code_points(), 172uz);
        EXPECT_EQ(joined, u"abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ--😀--abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ--🍕--abcdefghijklmnopqrstuvwxyz--ABCDEFGHIJKLMNOPQRSTUVWXYZ"sv);
    }
}
|
||||
|
||||
TEST_CASE(copy_operations)
|
||||
{
|
||||
auto test = [](Utf16String const& string1) {
|
||||
|
|
|
@ -7,10 +7,11 @@ get_property(CLANG_PLUGINS_COMPILE_OPTIONS_FOR_TESTS GLOBAL PROPERTY CLANG_PLUGI
|
|||
|
||||
list(APPEND CLANG_PLUGINS_COMPILE_OPTIONS_FOR_TESTS
|
||||
-std=c++23
|
||||
-Wno-user-defined-literals
|
||||
-Wno-invalid-offsetof
|
||||
-Wno-literal-range
|
||||
-Wno-unknown-warning-option
|
||||
-Wno-unqualified-std-cast-call
|
||||
-Wno-user-defined-literals
|
||||
)
|
||||
|
||||
# Ensure we always check for invalid function field types regardless of the value of ENABLE_CLANG_PLUGINS_INVALID_FUNCTION_MEMBERS
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue