AK: Add a UTF-16 string with optimized short- and ASCII-string storage

This is a strictly UTF-16 string with some optimizations for ASCII.

* If created from a short UTF-8 or UTF-16 string that is also ASCII,
  then the string is stored in an inlined byte buffer.

* If created with a long UTF-8 or UTF-16 string that is also ASCII,
  then the string is stored in an outlined char buffer.

* If created with a short or long UTF-8 or UTF-16 string that is not
  ASCII, then the string is stored in an outlined char16 buffer.

We do not store short non-ASCII text in the inlined buffer to avoid
confusion with operations such as `length_in_code_units` and
`code_unit_at`. For example, "😀" would be stored as 4 UTF-8 bytes
in short string form. But we still want `length_in_code_units` to
be 2, and `code_unit_at(0)` to be 0xD83D.
This commit is contained in:
Timothy Flynn 2025-06-12 19:29:41 -04:00 committed by Tim Flynn
commit fe676585f5
Notes: github-actions[bot] 2025-07-18 16:47:31 +00:00
17 changed files with 1527 additions and 44 deletions

View file

@ -29,6 +29,8 @@ set(SOURCES
StringUtils.cpp
StringView.cpp
Time.cpp
Utf16String.cpp
Utf16StringData.cpp
Utf16View.cpp
Utf32View.cpp
Utf8View.cpp

View file

@ -19,6 +19,7 @@ template<size_t inline_capacity>
class ByteBuffer;
class StringData;
class Utf16StringData;
}
@ -52,6 +53,7 @@ class String;
class StringBuilder;
class StringView;
class UnixDateTime;
class Utf16String;
class Utf16View;
class Utf32CodePointIterator;
class Utf32View;
@ -198,6 +200,7 @@ using AK::StringView;
using AK::TrailingCodePointTransformation;
using AK::Traits;
using AK::UnixDateTime;
using AK::Utf16String;
using AK::Utf16View;
using AK::Utf32CodePointIterator;
using AK::Utf32View;

View file

@ -249,8 +249,10 @@ ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
{
if (utf16_view.is_empty())
return {};
if (utf16_view.has_ascii_storage())
return try_append(utf16_view.bytes());
auto remaining_view = utf16_view.span();
auto remaining_view = utf16_view.utf16_span();
auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(remaining_view);
// Possibly over-allocate a little to ensure we don't have to allocate later.

73
AK/Utf16String.cpp Normal file
View file

@ -0,0 +1,73 @@
/*
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/Utf16String.h>
#include <AK/Utf32View.h>
#include <simdutf.h>
namespace AK {
static_assert(sizeof(Detail::ShortString) == sizeof(Detail::Utf16StringData*));
// Creates a Utf16String from UTF-8 input without validating it first.
//
// Input that is entirely ASCII and no longer than MAX_SHORT_STRING_BYTE_COUNT bytes is copied into the inline short
// string buffer. Everything else is delegated to Utf16StringData::from_utf8 with ASCII storage permitted.
Utf16String Utf16String::from_utf8_without_validation(StringView utf8_string)
{
    auto const byte_count = utf8_string.length();
    bool const fits_inline = byte_count <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf8_string.is_ascii();

    if (!fits_inline)
        return Utf16String { Detail::Utf16StringData::from_utf8(utf8_string, Detail::Utf16StringData::AllowASCIIStorage::Yes) };

    Utf16String short_string;
    short_string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(byte_count);

    auto copied = utf8_string.bytes().copy_to(short_string.m_value.short_ascii_string.storage);
    VERIFY(copied == byte_count);

    return short_string;
}
// Creates a Utf16String from a UTF-16 view without validating it first.
//
// A view that is entirely ASCII and no longer than MAX_SHORT_STRING_BYTE_COUNT code units ends up in the inline short
// string buffer: views that already have ASCII storage are copied byte-for-byte, views with UTF-16 storage are
// narrowed through simdutf. Everything else is delegated to Utf16StringData::from_utf16.
Utf16String Utf16String::from_utf16_without_validation(Utf16View const& utf16_string)
{
    auto const code_unit_count = utf16_string.length_in_code_units();
    bool const fits_inline = code_unit_count <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf16_string.is_ascii();

    if (!fits_inline)
        return Utf16String { Detail::Utf16StringData::from_utf16(utf16_string) };

    Utf16String short_string;
    short_string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(code_unit_count);
    auto& storage = short_string.m_value.short_ascii_string.storage;

    if (!utf16_string.has_ascii_storage()) {
        auto converted = simdutf::convert_utf16_to_utf8(utf16_string.utf16_span().data(), code_unit_count, reinterpret_cast<char*>(storage));
        VERIFY(converted == code_unit_count);
    } else {
        auto copied = utf16_string.bytes().copy_to(storage);
        VERIFY(copied == code_unit_count);
    }

    return short_string;
}
// Creates a Utf16String from a UTF-32 view.
//
// A view that is entirely ASCII and no longer than MAX_SHORT_STRING_BYTE_COUNT code points is narrowed into the
// inline short string buffer through simdutf. Everything else is delegated to Utf16StringData::from_utf32.
Utf16String Utf16String::from_utf32(Utf32View const& utf32_string)
{
    auto const code_point_count = utf32_string.length();
    bool const fits_inline = code_point_count <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf32_string.is_ascii();

    if (!fits_inline)
        return Utf16String { Detail::Utf16StringData::from_utf32(utf32_string) };

    Utf16String short_string;
    short_string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(code_point_count);

    auto const* source = reinterpret_cast<char32_t const*>(utf32_string.code_points());
    auto* destination = reinterpret_cast<char*>(short_string.m_value.short_ascii_string.storage);

    auto converted = simdutf::convert_utf32_to_utf8(source, code_point_count, destination);
    VERIFY(converted == code_point_count);

    return short_string;
}
// Formats a Utf16String. Strings with (short or long) ASCII storage are written through put_string; strings with
// long UTF-16 storage are appended to the underlying StringBuilder as a Utf16View.
ErrorOr<void> Formatter<Utf16String>::format(FormatBuilder& builder, Utf16String const& utf16_string)
{
    if (!utf16_string.has_long_utf16_storage())
        return builder.put_string(utf16_string.ascii_view());

    return builder.builder().try_append(utf16_string.utf16_view());
}
}

117
AK/Utf16String.h Normal file
View file

@ -0,0 +1,117 @@
/*
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Badge.h>
#include <AK/Error.h>
#include <AK/Format.h>
#include <AK/NonnullRefPtr.h>
#include <AK/String.h>
#include <AK/StringView.h>
#include <AK/Traits.h>
#include <AK/UnicodeUtils.h>
#include <AK/Utf16StringBase.h>
#include <AK/Utf16StringData.h>
#include <AK/Utf16View.h>
#include <AK/Utf8View.h>
namespace AK {
// Utf16String is a strongly owned sequence of Unicode code points encoded as UTF-16.
//
// The data may or may not be heap-allocated, and may or may not be reference counted. As a memory optimization, if the
// UTF-16 string is entirely ASCII, the string is stored as 8-bit bytes.
class [[nodiscard]] Utf16String : public Detail::Utf16StringBase {
public:
    using Utf16StringBase::Utf16StringBase;

    explicit constexpr Utf16String(Utf16StringBase&& base)
        : Utf16StringBase(move(base))
    {
    }

    // Factories that skip validation of their input. These are defined out of line.
    static Utf16String from_utf8_without_validation(StringView);
    static Utf16String from_utf16_without_validation(Utf16View const&);
    static Utf16String from_utf32(Utf32View const&);

    // Creates a string from UTF-8 input, VERIFYing that the input is valid UTF-8.
    ALWAYS_INLINE static Utf16String from_utf8(StringView utf8_string)
    {
        Utf8View view { utf8_string };
        VERIFY(view.validate());

        return from_utf8_without_validation(utf8_string);
    }

    // Creates a string from a String; no validation is performed on this path.
    ALWAYS_INLINE static Utf16String from_utf8(String const& utf8_string)
    {
        return from_utf8_without_validation(utf8_string);
    }

    // Creates a string from UTF-8 input, returning an error rather than crashing when the input is not valid UTF-8.
    ALWAYS_INLINE static ErrorOr<Utf16String> try_from_utf8(StringView utf8_string)
    {
        if (Utf8View { utf8_string }.validate())
            return from_utf8_without_validation(utf8_string);

        return Error::from_string_literal("Input was not valid UTF-8");
    }

    // Creates a string from a UTF-16 view, VERIFYing that the view validates.
    ALWAYS_INLINE static Utf16String from_utf16(Utf16View const& utf16_string)
    {
        VERIFY(utf16_string.validate());

        return from_utf16_without_validation(utf16_string);
    }

    // Creates a string from a UTF-16 view, returning an error rather than crashing when the view does not validate.
    ALWAYS_INLINE static ErrorOr<Utf16String> try_from_utf16(Utf16View const& utf16_string)
    {
        if (utf16_string.validate())
            return from_utf16_without_validation(utf16_string);

        return Error::from_string_literal("Input was not valid UTF-16");
    }

    // The UTF-16 factories are deleted for arguments that already are a Utf16String.
    template<typename T>
    requires(IsOneOf<RemoveCVReference<T>, Utf16String>)
    static Utf16String from_utf16(T&&) = delete;

    template<typename T>
    requires(IsOneOf<RemoveCVReference<T>, Utf16String>)
    static ErrorOr<Utf16String> try_from_utf16(T&&) = delete;

    template<typename T>
    requires(IsOneOf<RemoveCVReference<T>, Utf16String>)
    static Utf16String from_utf16_without_validation(T&&) = delete;

private:
    // Wraps already-created outlined string data.
    ALWAYS_INLINE explicit Utf16String(NonnullRefPtr<Detail::Utf16StringData const> value)
        : Utf16StringBase(move(value))
    {
    }
};
// Formatter specialization for Utf16String. Only declares format(); the definition is not part of this header.
template<>
struct Formatter<Utf16String> : Formatter<FormatString> {
ErrorOr<void> format(FormatBuilder&, Utf16String const&);
};
// Traits specialization for Utf16String: hashing forwards to Utf16String::hash(); everything else comes from
// DefaultTraits.
template<>
struct Traits<Utf16String> : public DefaultTraits<Utf16String> {
static unsigned hash(Utf16String const& s) { return s.hash(); }
};
}
// User-defined literal creating a Utf16String from a narrow string literal. The literal is only checked to be valid
// UTF-8 through ASSERT before being handed to the non-validating factory.
[[nodiscard]] ALWAYS_INLINE AK::Utf16String operator""_utf16(char const* string, size_t length)
{
    AK::StringView literal { string, length };
    ASSERT(AK::Utf8View { literal }.validate());

    return AK::Utf16String::from_utf8_without_validation(literal);
}
// User-defined literal creating a Utf16String from a char16_t string literal. The literal is only checked to
// validate through ASSERT before being handed to the non-validating factory.
[[nodiscard]] ALWAYS_INLINE AK::Utf16String operator""_utf16(char16_t const* string, size_t length)
{
    AK::Utf16View literal { string, length };
    ASSERT(literal.validate());

    return AK::Utf16String::from_utf16_without_validation(literal);
}

268
AK/Utf16StringBase.h Normal file
View file

@ -0,0 +1,268 @@
/*
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/CharacterTypes.h>
#include <AK/NonnullRefPtr.h>
#include <AK/StringBase.h>
#include <AK/StringView.h>
#include <AK/Utf16StringData.h>
#include <AK/Utf16View.h>
namespace AK::Detail {
// Storage and common operations for Utf16String.
//
// The string is either held inline as a ShortString of ASCII bytes, or as a pointer to reference counted
// Utf16StringData. Both alternatives share one pointer-sized union; has_short_ascii_storage() tells them apart by
// inspecting the short string flag.
class Utf16StringBase {
public:
    // Constructs an empty string held in the inline short string storage.
    constexpr Utf16StringBase()
        : Utf16StringBase(ShortString::create_empty())
    {
    }

    explicit constexpr Utf16StringBase(ShortString short_string)
        : m_value { .short_ascii_string = short_string }
    {
    }

    // Takes over the reference held by `value`; it is released again by destroy_string() / the assignment operators.
    ALWAYS_INLINE explicit Utf16StringBase(NonnullRefPtr<Utf16StringData const> value)
        : m_value { .data = &value.leak_ref() }
    {
    }

    ALWAYS_INLINE Utf16StringBase(Utf16StringBase const& other)
        : m_value(other.m_value)
    {
        if (has_long_storage())
            data_without_union_member_assertion()->ref();
    }

    // Moving steals the storage of `other` and leaves `other` as an empty short string.
    constexpr Utf16StringBase(Utf16StringBase&& other)
        : m_value(other.m_value)
    {
        other.m_value = { .short_ascii_string = ShortString::create_empty() };
    }

    constexpr ~Utf16StringBase()
    {
        if (!is_constant_evaluated())
            destroy_string();
    }

    ALWAYS_INLINE operator Utf16View() const& { return utf16_view(); }
    explicit operator Utf16View() const&& = delete;

    [[nodiscard]] ALWAYS_INLINE String to_utf8(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const
    {
        return MUST(utf16_view().to_utf8(allow_lonely_surrogates));
    }

    [[nodiscard]] ALWAYS_INLINE String to_utf8_but_should_be_ported_to_utf16(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const
    {
        return to_utf8(allow_lonely_surrogates);
    }

    [[nodiscard]] ALWAYS_INLINE ByteString to_byte_string(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const
    {
        return MUST(utf16_view().to_byte_string(allow_lonely_surrogates));
    }

    // Returns the ASCII bytes of this string. VERIFYs that the string has (short or long) ASCII storage.
    [[nodiscard]] ALWAYS_INLINE StringView ascii_view() const&
    {
        if (has_short_ascii_storage())
            return short_ascii_string_without_union_member_assertion().bytes();

        VERIFY(has_long_ascii_storage());
        return data_without_union_member_assertion()->ascii_view();
    }

    [[nodiscard]] ALWAYS_INLINE Utf16View utf16_view() const&
    {
        if (has_short_ascii_storage())
            return Utf16View { ascii_view().characters_without_null_termination(), length_in_code_units() };

        return data_without_union_member_assertion()->utf16_view();
    }

    // Views into a temporary string would dangle (short strings live inside the object itself).
    StringView ascii_view() const&& = delete;
    Utf16View utf16_view() const&& = delete;

    ALWAYS_INLINE Utf16StringBase& operator=(Utf16StringBase const& other)
    {
        if (&other != this) {
            if (has_long_storage())
                data_without_union_member_assertion()->unref();

            m_value = other.m_value;

            if (has_long_storage())
                data_without_union_member_assertion()->ref();
        }

        return *this;
    }

    ALWAYS_INLINE Utf16StringBase& operator=(Utf16StringBase&& other)
    {
        // Guard against self-move: without it we would drop our own reference and then store the (possibly already
        // freed) data pointer right back into m_value.
        if (&other != this) {
            if (has_long_storage())
                data_without_union_member_assertion()->unref();

            m_value = exchange(other.m_value, { .short_ascii_string = ShortString::create_empty() });
        }

        return *this;
    }

    [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16StringBase const& other) const
    {
        // Two short strings are compared by comparing the raw bits of their inline storage.
        if (has_short_ascii_storage() && other.has_short_ascii_storage())
            return bit_cast<FlatPtr>(m_value) == bit_cast<FlatPtr>(other.m_value);

        if (has_long_storage() && other.has_long_storage())
            return *data_without_union_member_assertion() == *other.data_without_union_member_assertion();

        return utf16_view() == other.utf16_view();
    }

    [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16View const& other) const { return utf16_view() == other; }
    [[nodiscard]] ALWAYS_INLINE bool operator==(StringView other) const { return utf16_view() == other; }

    [[nodiscard]] ALWAYS_INLINE bool equals_ignoring_ascii_case(Utf16View const& other) const { return utf16_view().equals_ignoring_ascii_case(other); }
    [[nodiscard]] ALWAYS_INLINE bool equals_ignoring_ascii_case(Utf16StringBase const& other) const { return utf16_view().equals_ignoring_ascii_case(other.utf16_view()); }

    template<typename... Ts>
    [[nodiscard]] ALWAYS_INLINE bool is_one_of(Ts&&... strings) const
    {
        return (this->operator==(forward<Ts>(strings)) || ...);
    }

    template<typename... Ts>
    [[nodiscard]] ALWAYS_INLINE bool is_one_of_ignoring_ascii_case(Ts&&... strings) const
    {
        return (this->equals_ignoring_ascii_case(forward<Ts>(strings)) || ...);
    }

    [[nodiscard]] ALWAYS_INLINE u32 hash() const
    {
        if (has_short_ascii_storage())
            return StringView { short_ascii_string_without_union_member_assertion().bytes() }.hash();

        return data_without_union_member_assertion()->hash();
    }

    [[nodiscard]] ALWAYS_INLINE bool is_empty() const { return length_in_code_units() == 0uz; }
    [[nodiscard]] ALWAYS_INLINE bool is_ascii() const { return utf16_view().is_ascii(); }

    [[nodiscard]] ALWAYS_INLINE size_t length_in_code_units() const
    {
        if (has_short_ascii_storage())
            return short_ascii_string_without_union_member_assertion().byte_count();

        return data_without_union_member_assertion()->length_in_code_units();
    }

    [[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const
    {
        // For short (ASCII) strings, every byte is exactly one code point.
        if (has_short_ascii_storage())
            return short_ascii_string_without_union_member_assertion().byte_count();

        return data_without_union_member_assertion()->length_in_code_points();
    }

    [[nodiscard]] ALWAYS_INLINE char16_t code_unit_at(size_t code_unit_offset) const { return utf16_view().code_unit_at(code_unit_offset); }
    [[nodiscard]] ALWAYS_INLINE u32 code_point_at(size_t code_unit_offset) const { return utf16_view().code_point_at(code_unit_offset); }

    [[nodiscard]] ALWAYS_INLINE size_t code_unit_offset_of(size_t code_point_offset) const
    {
        // With ASCII storage, code unit and code point offsets coincide.
        if (has_ascii_storage())
            return code_point_offset;

        return utf16_view().code_unit_offset_of(code_point_offset);
    }

    [[nodiscard]] ALWAYS_INLINE size_t code_point_offset_of(size_t code_unit_offset) const
    {
        // With ASCII storage, code unit and code point offsets coincide.
        if (has_ascii_storage())
            return code_unit_offset;

        return utf16_view().code_point_offset_of(code_unit_offset);
    }

    [[nodiscard]] ALWAYS_INLINE Utf16CodePointIterator begin() const { return utf16_view().begin(); }
    [[nodiscard]] ALWAYS_INLINE Utf16CodePointIterator end() const { return utf16_view().end(); }

    [[nodiscard]] ALWAYS_INLINE Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const
    {
        return utf16_view().substring_view(code_unit_offset, code_unit_length);
    }

    [[nodiscard]] ALWAYS_INLINE Utf16View substring_view(size_t code_unit_offset) const
    {
        return utf16_view().substring_view(code_unit_offset);
    }

    ALWAYS_INLINE Optional<size_t> find_code_unit_offset(char16_t needle, size_t start_offset = 0) const
    {
        return utf16_view().find_code_unit_offset(needle, start_offset);
    }

    ALWAYS_INLINE Optional<size_t> find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const
    {
        return utf16_view().find_code_unit_offset(needle, start_offset);
    }

    ALWAYS_INLINE Optional<size_t> find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset = 0) const
    {
        return utf16_view().find_code_unit_offset_ignoring_case(needle, start_offset);
    }

    [[nodiscard]] ALWAYS_INLINE bool starts_with(Utf16View const& needle) const { return utf16_view().starts_with(needle); }

    // This is primarily interesting to unit tests.
    [[nodiscard]] constexpr bool has_short_ascii_storage() const
    {
        if (is_constant_evaluated())
            return (m_value.short_ascii_string.byte_count_and_short_string_flag & StringBase::SHORT_STRING_FLAG) != 0;

        return (short_ascii_string_without_union_member_assertion().byte_count_and_short_string_flag & StringBase::SHORT_STRING_FLAG) != 0;
    }

    // This is primarily interesting to unit tests.
    [[nodiscard]] ALWAYS_INLINE bool has_long_ascii_storage() const
    {
        if (has_short_ascii_storage())
            return false;

        return data_without_union_member_assertion()->has_ascii_storage();
    }

    // This is primarily interesting to unit tests.
    [[nodiscard]] ALWAYS_INLINE bool has_ascii_storage() const
    {
        return has_short_ascii_storage() || has_long_ascii_storage();
    }

    // This is primarily interesting to unit tests.
    [[nodiscard]] ALWAYS_INLINE bool has_long_utf16_storage() const
    {
        if (has_short_ascii_storage())
            return false;

        return data_without_union_member_assertion()->has_utf16_storage();
    }

    // This is primarily interesting to unit tests.
    [[nodiscard]] ALWAYS_INLINE bool has_long_storage() const
    {
        return !has_short_ascii_storage();
    }

protected:
    // Drops the reference to the outlined data, if this string has any.
    ALWAYS_INLINE void destroy_string() const
    {
        if (has_long_storage())
            data_without_union_member_assertion()->unref();
    }

    // This is technically **invalid**! See StringBase for details.
    ALWAYS_INLINE ShortString const& short_ascii_string_without_union_member_assertion() const { return *__builtin_launder(&m_value.short_ascii_string); }
    ALWAYS_INLINE Utf16StringData const* data_without_union_member_assertion() const { return *__builtin_launder(&m_value.data); }

    union {
        ShortString short_ascii_string;
        Utf16StringData const* data;
    } m_value;
};
}

148
AK/Utf16StringData.cpp Normal file
View file

@ -0,0 +1,148 @@
/*
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/TypedTransfer.h>
#include <AK/Utf16StringData.h>
#include <AK/Utf32View.h>
#include <AK/Utf8View.h>
#include <simdutf.h>
namespace AK::Detail {
// Due to internal optimizations (the most significant bit of the stored length is reserved as the UTF-16 storage
// flag), we have an explicit maximum string length of 2**63 - 1.
//
// The argument is parenthesized so that expressions containing low-precedence operators are checked as a whole, and
// the expansion carries no trailing semicolon so that it behaves like one ordinary statement at the call site.
#define VERIFY_UTF16_LENGTH(length) VERIFY(((length) >> Detail::UTF16_FLAG) == 0)
// Allocates a Utf16StringData with trailing room for `code_unit_length` code units of the requested storage type
// (one byte per code unit for ASCII, one char16_t per code unit for UTF-16). The trailing code units are left
// uninitialized; the caller is expected to fill them in.
NonnullRefPtr<Utf16StringData> Utf16StringData::create_uninitialized(StorageType storage_type, size_t code_unit_length)
{
    auto const code_unit_size = storage_type == Utf16StringData::StorageType::ASCII ? sizeof(char) : sizeof(char16_t);

    // Refuse lengths for which the allocation size below would wrap around and yield an undersized allocation.
    VERIFY(code_unit_length <= (NumericLimits<size_t>::max() - sizeof(Utf16StringData)) / code_unit_size);

    auto const allocation_size = sizeof(Utf16StringData) + (code_unit_size * code_unit_length);

    void* slot = malloc(allocation_size);
    VERIFY(slot);

    return adopt_ref(*new (slot) Utf16StringData(storage_type, code_unit_length));
}
// Creates UTF-16 storage from any view that can be iterated as code points. The view is walked twice: once to
// measure the required number of code units (and count code points), and once to encode each code point.
template<typename ViewType>
NonnullRefPtr<Utf16StringData> Utf16StringData::create_from_code_point_iterable(ViewType const& view)
{
    size_t total_code_units = 0;
    size_t total_code_points = 0;

    for (auto code_point : view) {
        total_code_units += UnicodeUtils::code_unit_length_for_code_point(code_point);
        total_code_points++;
    }

    VERIFY_UTF16_LENGTH(total_code_units);

    auto result = create_uninitialized(StorageType::UTF16, total_code_units);
    result->m_length_in_code_points = total_code_points;

    auto* output = result->m_utf16_data;

    for (auto code_point : view) {
        (void)UnicodeUtils::code_point_to_utf16(code_point, [&](auto code_unit) {
            *output++ = code_unit;
        });
    }

    return result;
}
// Creates string data from UTF-8 input.
//
// * ASCII input is stored as bytes when `allow_ascii_storage` permits it.
// * Input that validates as UTF-8 without lonely surrogates is transcoded with simdutf.
// * Anything else is transcoded one code point at a time.
NonnullRefPtr<Utf16StringData> Utf16StringData::from_utf8(StringView utf8_string, AllowASCIIStorage allow_ascii_storage)
{
    auto const* utf8_data = utf8_string.characters_without_null_termination();
    auto const utf8_length = utf8_string.length();

    if (allow_ascii_storage == AllowASCIIStorage::Yes && utf8_string.is_ascii()) {
        VERIFY_UTF16_LENGTH(utf8_length);

        auto ascii_string = create_uninitialized(StorageType::ASCII, utf8_length);
        TypedTransfer<char>::copy(ascii_string->m_ascii_data, utf8_data, utf8_length);

        return ascii_string;
    }

    Utf8View view { utf8_string };

    if (!view.validate(AllowLonelySurrogates::No))
        return create_from_code_point_iterable(view);

    auto code_unit_length = simdutf::utf16_length_from_utf8(utf8_data, utf8_length);
    VERIFY_UTF16_LENGTH(code_unit_length);

    auto utf16_string = create_uninitialized(StorageType::UTF16, code_unit_length);

    auto converted = simdutf::convert_utf8_to_utf16(utf8_data, utf8_length, utf16_string->m_utf16_data);
    VERIFY(converted == code_unit_length);

    return utf16_string;
}
// Creates string data from a UTF-16 view.
//
// * A view that already has ASCII storage is copied byte-for-byte into ASCII storage.
// * A view with UTF-16 storage whose contents are all ASCII is narrowed into ASCII storage with simdutf.
// * Anything else is copied into UTF-16 storage, carrying over the view's code point length.
NonnullRefPtr<Utf16StringData> Utf16StringData::from_utf16(Utf16View const& utf16_string)
{
    auto const code_unit_length = utf16_string.length_in_code_units();
    VERIFY_UTF16_LENGTH(code_unit_length);

    if (utf16_string.has_ascii_storage()) {
        auto ascii_string = create_uninitialized(StorageType::ASCII, code_unit_length);
        TypedTransfer<char>::copy(ascii_string->m_ascii_data, utf16_string.ascii_span().data(), code_unit_length);

        return ascii_string;
    }

    if (utf16_string.is_ascii()) {
        auto ascii_string = create_uninitialized(StorageType::ASCII, code_unit_length);

        auto converted = simdutf::convert_utf16_to_utf8(utf16_string.utf16_span().data(), code_unit_length, ascii_string->m_ascii_data);
        VERIFY(converted == code_unit_length);

        return ascii_string;
    }

    auto wide_string = create_uninitialized(StorageType::UTF16, code_unit_length);
    TypedTransfer<char16_t>::copy(wide_string->m_utf16_data, utf16_string.utf16_span().data(), code_unit_length);
    wide_string->m_length_in_code_points = utf16_string.m_length_in_code_points;

    return wide_string;
}
// Creates string data from a UTF-32 view.
//
// * ASCII input is narrowed into ASCII storage with simdutf.
// * Input that simdutf accepts as valid UTF-32 is transcoded into UTF-16 storage with simdutf.
// * Anything else is transcoded one code point at a time.
NonnullRefPtr<Utf16StringData> Utf16StringData::from_utf32(Utf32View const& utf32_string)
{
    auto const* source = reinterpret_cast<char32_t const*>(utf32_string.code_points());
    auto source_length = utf32_string.length();

    if (utf32_string.is_ascii()) {
        VERIFY_UTF16_LENGTH(source_length);

        auto ascii_string = create_uninitialized(StorageType::ASCII, source_length);

        auto converted = simdutf::convert_utf32_to_utf8(source, source_length, ascii_string->m_ascii_data);
        VERIFY(converted == source_length);

        return ascii_string;
    }

    if (!simdutf::validate_utf32(source, source_length))
        return create_from_code_point_iterable(utf32_string);

    auto code_unit_length = simdutf::utf16_length_from_utf32(source, source_length);
    VERIFY_UTF16_LENGTH(code_unit_length);

    auto wide_string = create_uninitialized(StorageType::UTF16, code_unit_length);
    wide_string->m_length_in_code_points = source_length;

    auto converted = simdutf::convert_utf32_to_utf16(source, source_length, wide_string->m_utf16_data);
    VERIFY(converted == code_unit_length);

    return wide_string;
}
// Counts the code points of a string with UTF-16 storage. simdutf does the counting when it accepts the data as
// valid UTF-16; otherwise the code points are counted by iterating over the string.
size_t Utf16StringData::calculate_code_point_length() const
{
    ASSERT(!has_ascii_storage());

    if (!simdutf::validate_utf16(m_utf16_data, length_in_code_units())) {
        size_t counted = 0;

        for ([[maybe_unused]] auto code_point : utf16_view())
            counted++;

        return counted;
    }

    return simdutf::count_utf16(m_utf16_data, length_in_code_units());
}
}

133
AK/Utf16StringData.h Normal file
View file

@ -0,0 +1,133 @@
/*
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/NonnullRefPtr.h>
#include <AK/NumericLimits.h>
#include <AK/RefCounted.h>
#include <AK/Span.h>
#include <AK/StringView.h>
#include <AK/Types.h>
#include <AK/Utf16View.h>
namespace AK::Detail {
// Reference counted, outlined storage for Utf16String.
//
// The code units are stored directly behind the object, either as 8-bit ASCII bytes or as 16-bit UTF-16 code units.
// Which of the two is in use is recorded in the most significant bit of m_length_in_code_units.
class Utf16StringData final : public RefCounted<Utf16StringData> {
public:
    enum class StorageType : u8 {
        ASCII,
        UTF16,
    };

    enum class AllowASCIIStorage : u8 {
        No,
        Yes,
    };

    static NonnullRefPtr<Utf16StringData> from_utf8(StringView, AllowASCIIStorage);
    static NonnullRefPtr<Utf16StringData> from_utf16(Utf16View const&);
    static NonnullRefPtr<Utf16StringData> from_utf32(Utf32View const&);

    ~Utf16StringData() = default;

    // Instances are placement-constructed into malloc'ed memory, so they must be released with free.
    void operator delete(void* ptr)
    {
        free(ptr);
    }

    [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16StringData const& other) const
    {
        return utf16_view() == other.utf16_view();
    }

    [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16View const& other) const
    {
        return utf16_view() == other;
    }

    [[nodiscard]] ALWAYS_INLINE bool operator==(StringView const& other) const
    {
        if (has_ascii_storage())
            return ascii_view() == other;

        return utf16_view() == Utf16View { other.characters_without_null_termination(), other.length() };
    }

    [[nodiscard]] ALWAYS_INLINE bool has_ascii_storage() const { return m_length_in_code_units >> Detail::UTF16_FLAG == 0; }
    [[nodiscard]] ALWAYS_INLINE bool has_utf16_storage() const { return m_length_in_code_units >> Detail::UTF16_FLAG != 0; }

    // Returns the hash of this string. The hash is computed on first use and cached afterwards.
    ALWAYS_INLINE u32 hash() const
    {
        if (!m_has_hash) {
            m_hash = calculate_hash();
            // Remember that the cache is populated; otherwise every call would recompute the hash.
            m_has_hash = true;
        }

        return m_hash;
    }

    // The storage flag bit is masked out of the returned length.
    [[nodiscard]] ALWAYS_INLINE size_t length_in_code_units() const { return m_length_in_code_units & ~(1uz << Detail::UTF16_FLAG); }

    // Returns the number of code points. For UTF-16 storage this is computed on first use and cached afterwards;
    // NumericLimits<size_t>::max() marks a length that has not been computed yet.
    [[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const
    {
        if (has_ascii_storage())
            return length_in_code_units();

        if (m_length_in_code_points == NumericLimits<size_t>::max())
            m_length_in_code_points = calculate_code_point_length();

        return m_length_in_code_points;
    }

    [[nodiscard]] ALWAYS_INLINE StringView ascii_view() const
    {
        ASSERT(has_ascii_storage());
        return { m_ascii_data, length_in_code_units() };
    }

    [[nodiscard]] ALWAYS_INLINE Utf16View utf16_view() const
    {
        if (has_ascii_storage())
            return { m_ascii_data, length_in_code_units() };

        // Hand our (possibly not yet computed) code point length to the view along with the data.
        Utf16View view { m_utf16_data, length_in_code_units() };
        view.m_length_in_code_points = m_length_in_code_points;
        return view;
    }

private:
    ALWAYS_INLINE Utf16StringData(StorageType storage_type, size_t code_unit_length)
        : m_length_in_code_units(code_unit_length)
    {
        if (storage_type == StorageType::UTF16)
            m_length_in_code_units |= 1uz << Detail::UTF16_FLAG;
    }

    static NonnullRefPtr<Utf16StringData> create_uninitialized(StorageType storage_type, size_t code_unit_length);

    template<typename ViewType>
    static NonnullRefPtr<Utf16StringData> create_from_code_point_iterable(ViewType const&);

    [[nodiscard]] size_t calculate_code_point_length() const;

    [[nodiscard]] ALWAYS_INLINE u32 calculate_hash() const
    {
        if (has_ascii_storage())
            return ascii_view().hash();

        return utf16_view().hash();
    }

    // We store whether this string has ASCII or UTF-16 storage by setting the most significant bit of m_length_in_code_units
    // to 1 for UTF-16 storage. This shrinks the size of most UTF-16 string related classes, at the cost of not being
    // allowed to create a string larger than 2**63 - 1.
    size_t m_length_in_code_units { 0 };

    mutable size_t m_length_in_code_points { NumericLimits<size_t>::max() };

    mutable u32 m_hash { 0 };
    mutable bool m_has_hash { false };

    // The code units live directly behind the object; see create_uninitialized() for the allocation.
    union {
        char m_ascii_data[0];
        char16_t m_utf16_data[0];
    };
};
}

View file

@ -99,15 +99,19 @@ ErrorOr<String> Utf16View::to_utf8(AllowLonelySurrogates allow_lonely_surrogates
{
if (is_empty())
return String {};
if (has_ascii_storage())
return String::from_utf8_without_validation(bytes());
if (!validate(allow_lonely_surrogates))
return Error::from_string_literal("Input was not valid UTF-16");
if (allow_lonely_surrogates == AllowLonelySurrogates::No) {
String result;
auto utf8_length = simdutf::utf8_length_from_utf16(m_string, length_in_code_units());
auto utf8_length = simdutf::utf8_length_from_utf16(m_string.utf16, length_in_code_units());
TRY(result.replace_with_new_string(Badge<Utf16View> {}, utf8_length, [&](Bytes buffer) -> ErrorOr<void> {
[[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(m_string, length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
[[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(m_string.utf16, length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
ASSERT(result == buffer.size());
return {};
}));
@ -127,17 +131,25 @@ ErrorOr<ByteString> Utf16View::to_byte_string(AllowLonelySurrogates allow_lonely
bool Utf16View::is_ascii() const
{
if (has_ascii_storage())
return true;
// FIXME: Petition simdutf to implement an ASCII validator for UTF-16.
return all_of(span(), AK::is_ascii);
return all_of(utf16_span(), AK::is_ascii);
}
bool Utf16View::validate(size_t& valid_code_units, AllowLonelySurrogates allow_lonely_surrogates) const
{
if (has_ascii_storage()) {
valid_code_units = length_in_code_units();
return true;
}
auto view = *this;
valid_code_units = 0;
while (!view.is_empty()) {
auto result = simdutf::validate_utf16_with_errors(view.m_string, view.length_in_code_units());
auto result = simdutf::validate_utf16_with_errors(view.m_string.utf16, view.length_in_code_units());
valid_code_units += result.count;
if (result.error == simdutf::SUCCESS)
@ -197,7 +209,9 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
return substring_view(code_point_offset, code_point_length);
auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) {
return it.m_iterator - m_string;
if (has_ascii_storage())
return it.m_iterator.ascii - m_string.ascii;
return it.m_iterator.utf16 - m_string.utf16;
};
size_t code_point_index = 0;
@ -220,9 +234,11 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
size_t Utf16View::calculate_length_in_code_points() const
{
ASSERT(!has_ascii_storage());
// simdutf's code point length method assumes valid UTF-16, whereas we allow lonely surrogates.
if (validate(AllowLonelySurrogates::No)) [[likely]]
return simdutf::count_utf16(m_string, length_in_code_units());
return simdutf::count_utf16(m_string.utf16, length_in_code_units());
size_t code_points = 0;
for ([[maybe_unused]] auto code_point : *this)

View file

@ -37,6 +37,13 @@ ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const&);
size_t utf16_code_unit_length_from_utf8(StringView);
namespace Detail {
static constexpr inline auto UTF16_FLAG = NumericLimits<size_t>::digits() - 1;
class Utf16StringBase;
}
class Utf16CodePointIterator {
friend class Utf16View;
@ -46,23 +53,35 @@ public:
constexpr Utf16CodePointIterator& operator++()
{
VERIFY(m_remaining_code_units > 0);
auto remaining_code_units = this->remaining_code_units();
VERIFY(remaining_code_units > 0);
auto length = min(length_in_code_units(), m_remaining_code_units);
m_iterator += length;
if (has_ascii_storage()) {
++m_iterator.ascii;
--m_remaining_code_units;
} else {
auto length = min(length_in_code_units(), remaining_code_units);
m_iterator.utf16 += length;
m_remaining_code_units -= length;
}
return *this;
}
constexpr u32 operator*() const
{
VERIFY(m_remaining_code_units > 0);
auto code_unit = *m_iterator;
auto remaining_code_units = this->remaining_code_units();
VERIFY(remaining_code_units > 0);
if (has_ascii_storage())
return *m_iterator.ascii;
auto code_unit = *m_iterator.utf16;
if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) {
if (m_remaining_code_units > 1) {
auto next_code_unit = *(m_iterator + 1);
if (remaining_code_units > 1) {
auto next_code_unit = *(m_iterator.utf16 + 1);
if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit))
return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
@ -79,22 +98,46 @@ public:
[[nodiscard]] constexpr bool operator==(Utf16CodePointIterator const& other) const
{
return (m_iterator == other.m_iterator) && (m_remaining_code_units == other.m_remaining_code_units);
// Note that this also protects against iterators with different underlying storage.
if (m_remaining_code_units != other.m_remaining_code_units)
return false;
if (has_ascii_storage())
return m_iterator.ascii == other.m_iterator.ascii;
return m_iterator.utf16 == other.m_iterator.utf16;
}
[[nodiscard]] constexpr size_t length_in_code_units() const
[[nodiscard]] ALWAYS_INLINE size_t length_in_code_units()
{
if (has_ascii_storage())
return 1;
return UnicodeUtils::code_unit_length_for_code_point(**this);
}
private:
constexpr Utf16CodePointIterator(char16_t const* ptr, size_t length)
: m_iterator(ptr)
constexpr Utf16CodePointIterator(char const* iterator, size_t length)
: m_iterator { .ascii = iterator }
, m_remaining_code_units(length)
{
}
char16_t const* m_iterator { nullptr };
constexpr Utf16CodePointIterator(char16_t const* iterator, size_t length)
: m_iterator { .utf16 = iterator }
, m_remaining_code_units(length)
{
m_remaining_code_units |= 1uz << Detail::UTF16_FLAG;
}
constexpr bool has_ascii_storage() const { return m_remaining_code_units >> Detail::UTF16_FLAG == 0; }
constexpr size_t remaining_code_units() const { return m_remaining_code_units & ~(1uz << Detail::UTF16_FLAG); }
union {
char const* ascii;
char16_t const* utf16;
} m_iterator { .ascii = nullptr };
// Just like Utf16StringData, we store whether this string has ASCII or UTF-16 storage by setting the most
// significant bit of m_remaining_code_units for UTF-16 storage.
size_t m_remaining_code_units { 0 };
};
@ -106,38 +149,86 @@ public:
~Utf16View() = default;
// Creates a view over `length_in_code_units` UTF-16 code units starting at `string`.
constexpr Utf16View(char16_t const* string, size_t length_in_code_units)
: m_string(string)
: m_string { .utf16 = string }
, m_length_in_code_units(length_in_code_units)
{
// Mark this view as having UTF-16 storage by setting the storage flag bit in the length field.
m_length_in_code_units |= 1uz << Detail::UTF16_FLAG;
}
// Creates a view over the code units held by `string`, using its data() pointer and size().
constexpr Utf16View(Utf16Data const& string)
: m_string(string.data())
: m_string { .utf16 = string.data() }
, m_length_in_code_units(string.size())
{
// Mark this view as having UTF-16 storage by setting the storage flag bit in the length field.
m_length_in_code_units |= 1uz << Detail::UTF16_FLAG;
}
// Compile-time-only construction from a StringView. The characters are referenced in place as ASCII storage,
// and the string is verified to consist solely of ASCII characters.
consteval Utf16View(StringView string)
    : m_string { .ascii = string.characters_without_null_termination() }
    , m_length_in_code_units(string.length())
{
    VERIFY(all_of(string, [](char character) { return AK::is_ascii(character); }));
}
Utf16View(Utf16ConversionResult&&) = delete;
// Creates a view over the code units of a conversion result. The result's code point count is used to seed
// m_length_in_code_points.
explicit Utf16View(Utf16ConversionResult const& conversion_result)
: m_string(conversion_result.data.data())
: m_string { .utf16 = conversion_result.data.data() }
, m_length_in_code_units(conversion_result.data.size())
, m_length_in_code_points(conversion_result.code_point_count)
{
// Mark this view as having UTF-16 storage by setting the storage flag bit in the length field.
m_length_in_code_units |= 1uz << Detail::UTF16_FLAG;
}
ErrorOr<String> to_utf8(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
ErrorOr<ByteString> to_byte_string(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
[[nodiscard]] constexpr ReadonlySpan<char16_t> span() const
// Wrapper around to_utf8() which unwraps the result with MUST() rather than returning an ErrorOr.
ALWAYS_INLINE String to_utf8_but_should_be_ported_to_utf16(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const
{
return { m_string, length_in_code_units() };
return MUST(to_utf8(allow_lonely_surrogates));
}
// Whether this view refers to ASCII (char) storage rather than UTF-16 (char16_t) storage. This is the case when
// no bit at or above Detail::UTF16_FLAG is set in the length field. This is constexpr, like the equivalent helper
// on Utf16CodePointIterator, so that the constexpr accessors which branch on it remain usable in constant
// expressions.
[[nodiscard]] ALWAYS_INLINE constexpr bool has_ascii_storage() const { return (m_length_in_code_units >> Detail::UTF16_FLAG) == 0; }
// The ASCII storage viewed as raw bytes. Verifies that this view has ASCII storage.
[[nodiscard]] constexpr ReadonlyBytes bytes() const
{
    VERIFY(has_ascii_storage());

    auto const size_in_bytes = length_in_code_units();
    return { m_string.ascii, size_in_bytes };
}
// The ASCII storage viewed as a span of chars. Verifies that this view has ASCII storage.
[[nodiscard]] constexpr ReadonlySpan<char> ascii_span() const
{
    VERIFY(has_ascii_storage());

    auto const* characters = m_string.ascii;
    return { characters, length_in_code_units() };
}
// The UTF-16 storage viewed as a span of code units. Verifies that this view does not have ASCII storage.
[[nodiscard]] constexpr ReadonlySpan<char16_t> utf16_span() const
{
    VERIFY(!has_ascii_storage());

    auto const* code_units = m_string.utf16;
    return { code_units, length_in_code_units() };
}
// Compares two views code unit by code unit, regardless of which storage kind each view has.
[[nodiscard]] constexpr bool operator==(Utf16View const& other) const
{
// Views with different code unit lengths can never be equal.
if (length_in_code_units() != other.length_in_code_units())
return false;
return TypedTransfer<char16_t>::compare(m_string, other.m_string, length_in_code_units());
// When both views have the same storage kind, the underlying buffers are compared directly.
if (has_ascii_storage() && other.has_ascii_storage())
return TypedTransfer<char>::compare(m_string.ascii, other.m_string.ascii, length_in_code_units());
if (!has_ascii_storage() && !other.has_ascii_storage())
return TypedTransfer<char16_t>::compare(m_string.utf16, other.m_string.utf16, length_in_code_units());
// Mixed storage: the element widths differ, so compare one code unit at a time.
for (size_t i = 0; i < length_in_code_units(); ++i) {
if (code_unit_at(i) != other.code_unit_at(i))
return false;
}
return true;
}
// Compares this view against a StringView. With ASCII storage the bytes are compared directly; otherwise the
// StringView's characters are wrapped in an ASCII-storage view and compared through the Utf16View overload.
[[nodiscard]] constexpr bool operator==(StringView other) const
{
    if (!has_ascii_storage()) {
        Utf16View const other_view { other.characters_without_null_termination(), other.length() };
        return *this == other_view;
    }

    return bytes() == other.bytes();
}
[[nodiscard]] constexpr bool equals_ignoring_case(Utf16View const& other) const
@ -175,10 +266,18 @@ public:
{
// An empty view hashes to 0 regardless of its storage kind.
if (is_empty())
return 0;
return string_hash(reinterpret_cast<char const*>(m_string), length_in_code_units() * sizeof(char16_t));
// ASCII storage is hashed over one byte per code unit.
if (has_ascii_storage())
return string_hash(m_string.ascii, length_in_code_units());
// UTF-16 storage is hashed over the raw bytes of its char16_t code units.
// NOTE(review): operator==(Utf16View const&) treats an ASCII-storage view and a UTF-16-storage view holding the
// same code units as equal, but these two branches feed different byte sequences to string_hash for them.
// Confirm that hash-based lookups never mix storage kinds for equal content.
return string_hash(reinterpret_cast<char const*>(m_string.utf16), length_in_code_units() * sizeof(char16_t));
}
// Whether the pointer for this view's storage kind is null.
[[nodiscard]] constexpr bool is_null() const
{
    return has_ascii_storage()
        ? (m_string.ascii == nullptr)
        : (m_string.utf16 == nullptr);
}
[[nodiscard]] constexpr bool is_null() const { return m_string == nullptr; }
[[nodiscard]] constexpr bool is_empty() const { return length_in_code_units() == 0; }
[[nodiscard]] bool is_ascii() const;
@ -190,10 +289,13 @@ public:
[[nodiscard]] bool validate(size_t& valid_code_units, AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
[[nodiscard]] constexpr size_t length_in_code_units() const { return m_length_in_code_units; }
// The number of code units in this view, i.e. the stored length with the storage flag bit masked off.
[[nodiscard]] constexpr size_t length_in_code_units() const
{
    auto const storage_flag = 1uz << Detail::UTF16_FLAG;
    return m_length_in_code_units & ~storage_flag;
}
[[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const
{
if (has_ascii_storage())
return m_length_in_code_units;
if (m_length_in_code_points == NumericLimits<size_t>::max())
m_length_in_code_points = calculate_length_in_code_points();
return m_length_in_code_points;
@ -201,6 +303,9 @@ public:
constexpr Optional<size_t> length_in_code_points_if_known() const
{
if (has_ascii_storage())
return m_length_in_code_units;
if (m_length_in_code_points == NumericLimits<size_t>::max())
return {};
return m_length_in_code_points;
@ -211,7 +316,10 @@ public:
// Returns the code unit at `index`, which is verified to be within the view.
[[nodiscard]] constexpr char16_t code_unit_at(size_t index) const
{
VERIFY(index < length_in_code_units());
return m_string[index];
// With ASCII storage, the char at `index` is converted to the char16_t return type.
if (has_ascii_storage())
return m_string.ascii[index];
return m_string.utf16[index];
}
[[nodiscard]] constexpr u32 code_point_at(size_t index) const
@ -236,18 +344,25 @@ public:
// Returns a code point iterator positioned at the start of the view.
[[nodiscard]] constexpr Utf16CodePointIterator begin() const
{
return { m_string, length_in_code_units() };
// The pointer type passed here selects the iterator constructor for the matching storage kind.
if (has_ascii_storage())
return { m_string.ascii, length_in_code_units() };
return { m_string.utf16, length_in_code_units() };
}
// Returns a code point iterator positioned one past the last code unit, with no code units remaining.
[[nodiscard]] constexpr Utf16CodePointIterator end() const
{
return { m_string + length_in_code_units(), 0 };
// The pointer type passed here selects the iterator constructor for the matching storage kind.
if (has_ascii_storage())
return { m_string.ascii + length_in_code_units(), 0 };
return { m_string.utf16 + length_in_code_units(), 0 };
}
// Returns a view of `code_unit_length` code units starting at `code_unit_offset`. The requested range is verified
// to lie within this view.
[[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const
{
VERIFY(code_unit_offset + code_unit_length <= length_in_code_units());
return { m_string + code_unit_offset, code_unit_length };
// The pointer type passed here selects the view constructor for the matching storage kind, so the substring
// has the same storage kind as this view.
if (has_ascii_storage())
return { m_string.ascii + code_unit_offset, code_unit_length };
return { m_string.utf16 + code_unit_offset, code_unit_length };
}
[[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset) const { return substring_view(code_unit_offset, length_in_code_units() - code_unit_offset); }
@ -259,12 +374,42 @@ public:
{
if (start_offset >= length_in_code_units())
return {};
return AK::memmem_optional(m_string + start_offset, (length_in_code_units() - start_offset) * sizeof(char16_t), &needle, sizeof(needle));
if (has_ascii_storage()) {
    // ASCII storage cannot contain a non-ASCII code unit, so report "not found". This must be an empty result:
    // `return false` would be converted to a value (offset 0) rather than to an empty Optional.
    if (!AK::is_ascii(needle))
        return {};

    // Each code unit occupies a single byte in ASCII storage, so search for the needle as a single byte.
    auto byte = static_cast<char>(needle);
    return AK::memmem_optional(m_string.ascii + start_offset, length_in_code_units() - start_offset, &byte, sizeof(byte));
}
return AK::memmem_optional(m_string.utf16 + start_offset, (length_in_code_units() - start_offset) * sizeof(char16_t), &needle, sizeof(needle));
}
// Finds the code unit offset of the first occurrence of `needle` at or after `start_offset`, if any.
constexpr Optional<size_t> find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const
{
return span().index_of(needle.span(), start_offset);
// When both views have the same storage kind, search the underlying spans directly.
if (has_ascii_storage() && needle.has_ascii_storage())
return ascii_span().index_of(needle.ascii_span(), start_offset);
if (!has_ascii_storage() && !needle.has_ascii_storage())
return utf16_span().index_of(needle.utf16_span(), start_offset);
// Mixed storage: bail out if the needle cannot fit after start_offset, guarding against overflow of the sum.
Checked maximum_offset { start_offset };
maximum_offset += needle.length_in_code_units();
if (maximum_offset.has_overflow() || maximum_offset.value() > length_in_code_units())
return {};
// An empty needle matches at the starting offset.
if (needle.is_empty())
return start_offset;
// Compare needle-sized slices of this view against the needle. The subtraction in the loop condition cannot
// underflow because of the length check above.
for (size_t index = start_offset; index <= length_in_code_units() - needle.length_in_code_units();) {
auto slice = substring_view(index, needle.length_in_code_units());
if (slice == needle)
return index;
// Advance by the code unit length of the code point at the start of the slice.
index += slice.begin().length_in_code_units();
}
return {};
}
constexpr Optional<size_t> find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset = 0) const
@ -298,9 +443,24 @@ public:
if (needle.length_in_code_units() > length_in_code_units())
return false;
if (m_string == needle.m_string)
if (has_ascii_storage() && needle.has_ascii_storage()) {
if (m_string.ascii == needle.m_string.ascii)
return true;
return ascii_span().starts_with(needle.ascii_span());
}
if (!has_ascii_storage() && !needle.has_ascii_storage()) {
if (m_string.utf16 == needle.m_string.utf16)
return true;
return utf16_span().starts_with(needle.utf16_span());
}
for (auto this_it = begin(), needle_it = needle.begin(); needle_it != needle.end(); ++needle_it, ++this_it) {
if (*this_it != *needle_it)
return false;
}
return true;
return span().starts_with(needle.span());
}
// https://infra.spec.whatwg.org/#code-unit-less-than
@ -320,9 +480,24 @@ public:
}
private:
friend Detail::Utf16StringBase;
friend Detail::Utf16StringData;
// Creates a view over ASCII storage. The characters are not validated here, and the length is stored without
// the UTF-16 storage flag bit.
constexpr Utf16View(char const* string, size_t length_in_code_units)
    : m_string { .ascii = string }
    , m_length_in_code_units { length_in_code_units }
{
}
[[nodiscard]] size_t calculate_length_in_code_points() const;
char16_t const* m_string { nullptr };
union {
char const* ascii;
char16_t const* utf16;
} m_string { .ascii = nullptr };
// Just like Utf16StringData, we store whether this string has ASCII or UTF-16 storage by setting the most
// significant bit of m_length_in_code_units for UTF-16 storage.
size_t m_length_in_code_units { 0 };
mutable size_t m_length_in_code_points { NumericLimits<size_t>::max() };
};
@ -342,6 +517,16 @@ struct Traits<Utf16View> : public DefaultTraits<Utf16View> {
static unsigned hash(Utf16View const& s) { return s.hash(); }
};
namespace Detail {

// Utf16View and Utf16String are declared hash-compatible with each other, in both directions.
template<>
inline constexpr bool IsHashCompatible<Utf16String, Utf16View> = true;

template<>
inline constexpr bool IsHashCompatible<Utf16View, Utf16String> = true;

}
}
[[nodiscard]] ALWAYS_INLINE AK_STRING_VIEW_LITERAL_CONSTEVAL AK::Utf16View operator""sv(char16_t const* string, size_t length)

View file

@ -6,7 +6,9 @@
#pragma once
#include <AK/AllOf.h>
#include <AK/Assertions.h>
#include <AK/CharacterTypes.h>
#include <AK/Checked.h>
#include <AK/Format.h>
#include <AK/Types.h>
@ -99,6 +101,12 @@ public:
bool is_null() const { return !m_code_points; }
size_t length() const { return m_length; }
// Returns whether every code point in this view is ASCII. An empty view is considered ASCII.
bool is_ascii() const
{
    // FIXME: Petition simdutf to implement an ASCII validator for UTF-32.
    for (auto code_point : *this) {
        if (!AK::is_ascii(code_point))
            return false;
    }

    return true;
}
size_t iterator_offset(Utf32CodePointIterator const& it) const
{
VERIFY(it.m_ptr >= m_code_points);

View file

@ -1451,7 +1451,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
auto ref_length = ref.length_in_code_units();
// k. Set result to the string-concatenation of result and refReplacement.
result.append(ref_replacement.span().data(), ref_replacement.length_in_code_units());
result.append(ref_replacement.utf16_span().data(), ref_replacement.length_in_code_units());
// j. Set templateRemainder to the substring of templateRemainder from refLength.
// NOTE: We do this step last because refReplacement may point to templateRemainder.

View file

@ -44,7 +44,13 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
{
Utf16Data string;
string.ensure_capacity(view.length_in_code_units());
string.unchecked_append(view.span().data(), view.length_in_code_units());
if (view.has_ascii_storage()) {
for (size_t i = 0; i < view.length_in_code_units(); ++i)
string.unchecked_append(static_cast<char16_t>(view.code_unit_at(i)));
} else {
string.unchecked_append(view.utf16_span().data(), view.length_in_code_units());
}
auto impl = create(move(string));
if (auto length_in_code_points = view.length_in_code_points_if_known(); length_in_code_points.has_value())

View file

@ -75,7 +75,12 @@ public:
// Sets the text to be segmented from a UTF-16 view.
virtual void set_segmented_text(Utf16View const& text) override
{
m_segmented_text = icu::UnicodeString { text.span().data(), static_cast<i32>(text.length_in_code_units()) };
// An ASCII-storage view has no char16_t buffer to hand to icu::UnicodeString (utf16_span() verifies UTF-16
// storage), so convert it to UTF-8 and delegate to the set_segmented_text overload accepting that result.
if (text.has_ascii_storage()) {
set_segmented_text(MUST(text.to_utf8()));
return;
}
m_segmented_text = icu::UnicodeString { text.utf16_span().data(), static_cast<i32>(text.length_in_code_units()) };
m_segmenter->setText(m_segmented_text.get<icu::UnicodeString>());
}

View file

@ -89,9 +89,9 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
Utf16Data full_data;
full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
full_data.append(before_data.span().data(), before_data.length_in_code_units());
full_data.append(before_data.utf16_span().data(), before_data.length_in_code_units());
full_data.extend(inserted_data_result.data);
full_data.append(after_data.span().data(), after_data.length_in_code_units());
full_data.append(after_data.utf16_span().data(), after_data.length_in_code_units());
Utf16View full_view { full_data };
bool characters_are_the_same = utf16_view == full_view;

View file

@ -76,6 +76,7 @@ set(AK_TEST_SOURCES
TestTypeTraits.cpp
TestTypedTransfer.cpp
TestUFixedBigInt.cpp
TestUtf16String.cpp
TestUtf16View.cpp
TestUtf8View.cpp
TestVariant.cpp

View file

@ -0,0 +1,516 @@
/*
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibTest/TestCase.h>
#include <AK/Array.h>
#include <AK/CharacterTypes.h>
#include <AK/Enumerate.h>
#include <AK/StringBuilder.h>
#include <AK/Utf16String.h>
#include <AK/Utf32View.h>
// Builds a new Utf16String from the view matching the storage kind of `string`.
static Utf16String make_copy(Utf16String const& string)
{
    if (string.has_ascii_storage())
        return Utf16String::from_utf8(string.ascii_view());

    return Utf16String::from_utf16(string.utf16_view());
}
TEST_CASE(empty_string)
{
    Utf16String empty {};

    // A default-constructed string is expected to be empty, ASCII, and held in short ASCII storage.
    EXPECT(empty.is_empty());
    EXPECT(empty.is_ascii());
    EXPECT(empty.has_short_ascii_storage());
    EXPECT(!empty.has_long_ascii_storage());

    EXPECT_EQ(empty.length_in_code_units(), 0uz);
    EXPECT_EQ(empty.length_in_code_points(), 0uz);
    EXPECT_EQ(empty.ascii_view(), StringView {});
}
TEST_CASE(from_utf8)
{
    // Checks a string which is expected to end up in (short or long) ASCII storage.
    auto expect_ascii = [](StringView utf8, bool is_long, size_t length) {
        auto string = Utf16String::from_utf8(utf8);

        EXPECT(!string.is_empty());
        EXPECT(string.is_ascii());
        EXPECT(string.has_long_ascii_storage() == is_long);
        EXPECT(string.has_short_ascii_storage() == !is_long);

        EXPECT_EQ(string.length_in_code_units(), length);
        EXPECT_EQ(string.length_in_code_points(), length);
        EXPECT_EQ(string.ascii_view(), utf8);
    };

    // Checks a string which is expected to end up in UTF-16 storage.
    auto expect_utf16 = [](StringView utf8, Utf16View const& expected, size_t code_units, size_t code_points) {
        auto string = Utf16String::from_utf8(utf8);

        EXPECT(!string.is_empty());
        EXPECT(!string.is_ascii());
        EXPECT(!string.has_long_ascii_storage());
        EXPECT(!string.has_short_ascii_storage());

        EXPECT_EQ(string.length_in_code_units(), code_units);
        EXPECT_EQ(string.length_in_code_points(), code_points);
        EXPECT_EQ(string.utf16_view(), expected);
    };

    expect_ascii("hello!"sv, false, 6uz);
    expect_ascii("hello there!"sv, true, 12uz);

    expect_utf16("😀"sv, u"😀"sv, 2uz, 1uz);
    expect_utf16("hello 😀 there!"sv, u"hello 😀 there!"sv, 15uz, 14uz);

    // Lonely high and low surrogates.
    expect_utf16("hello \xed\xa0\x80!"sv, u"hello \xd800!"sv, 8uz, 8uz);
    expect_utf16("hello \xed\xb0\x80!"sv, u"hello \xdc00!"sv, 8uz, 8uz);
}
TEST_CASE(from_utf16)
{
    // Checks a string which is expected to end up in (short or long) ASCII storage.
    auto expect_ascii = [](Utf16View const& utf16, StringView expected, bool is_long, size_t length) {
        auto string = Utf16String::from_utf16(utf16);

        EXPECT(!string.is_empty());
        EXPECT(string.is_ascii());
        EXPECT(string.has_long_ascii_storage() == is_long);
        EXPECT(string.has_short_ascii_storage() == !is_long);

        EXPECT_EQ(string.length_in_code_units(), length);
        EXPECT_EQ(string.length_in_code_points(), length);
        EXPECT_EQ(string.ascii_view(), expected);
    };

    // Checks a string which is expected to end up in UTF-16 storage and to compare equal to its input.
    auto expect_utf16 = [](Utf16View const& utf16, size_t code_units, size_t code_points) {
        auto string = Utf16String::from_utf16(utf16);

        EXPECT(!string.is_empty());
        EXPECT(!string.is_ascii());
        EXPECT(!string.has_long_ascii_storage());
        EXPECT(!string.has_short_ascii_storage());

        EXPECT_EQ(string.length_in_code_units(), code_units);
        EXPECT_EQ(string.length_in_code_points(), code_points);
        EXPECT_EQ(string.utf16_view(), utf16);
    };

    expect_ascii(u"hello!"sv, "hello!"sv, false, 6uz);
    expect_ascii(u"hello there!"sv, "hello there!"sv, true, 12uz);

    expect_utf16(u"😀"sv, 2uz, 1uz);
    expect_utf16(u"hello 😀 there!"sv, 15uz, 14uz);

    // Lonely high and low surrogates.
    expect_utf16(u"hello \xd800!"sv, 8uz, 8uz);
    expect_utf16(u"hello \xdc00!"sv, 8uz, 8uz);
}
TEST_CASE(from_utf32)
{
    // Counts the code points preceding the null terminator.
    auto strlen32 = [](char32_t const* string) {
        size_t length = 0;
        while (string[length] != 0)
            ++length;
        return length;
    };

    auto to_utf32_view = [&](char32_t const* string) {
        return Utf32View { reinterpret_cast<u32 const*>(string), strlen32(string) };
    };

    // Checks a string which is expected to end up in (short or long) ASCII storage.
    auto expect_ascii = [&](char32_t const* utf32, StringView expected, bool is_long, size_t length) {
        auto string = Utf16String::from_utf32(to_utf32_view(utf32));

        EXPECT(!string.is_empty());
        EXPECT(string.is_ascii());
        EXPECT(string.has_long_ascii_storage() == is_long);
        EXPECT(string.has_short_ascii_storage() == !is_long);

        EXPECT_EQ(string.length_in_code_units(), length);
        EXPECT_EQ(string.length_in_code_points(), length);
        EXPECT_EQ(string.ascii_view(), expected);
    };

    // Checks a string which is expected to end up in UTF-16 storage.
    auto expect_utf16 = [&](char32_t const* utf32, Utf16View const& expected, size_t code_units, size_t code_points) {
        auto string = Utf16String::from_utf32(to_utf32_view(utf32));

        EXPECT(!string.is_empty());
        EXPECT(!string.is_ascii());
        EXPECT(!string.has_long_ascii_storage());
        EXPECT(!string.has_short_ascii_storage());

        EXPECT_EQ(string.length_in_code_units(), code_units);
        EXPECT_EQ(string.length_in_code_points(), code_points);
        EXPECT_EQ(string.utf16_view(), expected);
    };

    expect_ascii(U"hello!", "hello!"sv, false, 6uz);
    expect_ascii(U"hello there!", "hello there!"sv, true, 12uz);

    expect_utf16(U"😀", u"😀"sv, 2uz, 1uz);
    expect_utf16(U"hello 😀 there!", u"hello 😀 there!"sv, 15uz, 14uz);

    // Lonely high and low surrogates.
    expect_utf16(U"hello \xd800!", u"hello \xd800!"sv, 8uz, 8uz);
    expect_utf16(U"hello \xdc00!", u"hello \xdc00!"sv, 8uz, 8uz);
}
TEST_CASE(copy_operations)
{
    // Copying must leave the source untouched and yield a string equal to it.
    auto verify_copies = [](Utf16String const& source) {
        auto expected = make_copy(source);

        // Copy constructor.
        Utf16String copy_constructed(source);
        EXPECT_EQ(source, expected);
        EXPECT_EQ(source, copy_constructed);

        // Copy assignment.
        Utf16String copy_assigned;
        copy_assigned = source;
        EXPECT_EQ(source, expected);
        EXPECT_EQ(source, copy_assigned);
    };

    verify_copies({});
    verify_copies("hello"_utf16);
    verify_copies("hello there general!"_utf16);
    verify_copies("hello 😀 there!"_utf16);
}
TEST_CASE(move_operations)
{
    // Moving must leave the source empty and transfer the original contents to the destination.
    auto verify_moves = [](Utf16String source) {
        auto expected = make_copy(source);

        // Move constructor.
        Utf16String move_constructed(move(source));
        EXPECT(source.is_empty());
        EXPECT_EQ(source, Utf16String {});
        EXPECT_EQ(move_constructed, expected);

        // Move assignment.
        Utf16String move_assigned;
        move_assigned = move(move_constructed);
        EXPECT(move_constructed.is_empty());
        EXPECT_EQ(move_constructed, Utf16String {});
        EXPECT_EQ(move_assigned, expected);
    };

    verify_moves({});
    verify_moves("hello"_utf16);
    verify_moves("hello there general!"_utf16);
    verify_moves("hello 😀 there!"_utf16);
}
// Exercises equality and inequality between Utf16String values, and between Utf16String and Utf16View, with
// the operands in both orders.
TEST_CASE(equals)
{
// `string1` is compared against itself and a copy of itself (expected equal), and against `inequal_string`
// (expected not equal).
auto test = [](Utf16String const& string1, Utf16String const& inequal_string) {
auto string2 = make_copy(string1);
EXPECT_EQ(string1, string1);
EXPECT_EQ(string1, string2);
EXPECT_EQ(string2, string1);
EXPECT_EQ(string2, string2);
// The utf16_view() comparisons are only performed when string1 has long UTF-16 storage.
if (string1.has_long_utf16_storage()) {
EXPECT_EQ(string1, string1.utf16_view());
EXPECT_EQ(string1, string2.utf16_view());
EXPECT_EQ(string2, string1.utf16_view());
EXPECT_EQ(string2, string2.utf16_view());
EXPECT_EQ(string1.utf16_view(), string1);
EXPECT_EQ(string1.utf16_view(), string2);
EXPECT_EQ(string2.utf16_view(), string1);
EXPECT_EQ(string2.utf16_view(), string2);
}
EXPECT_NE(string1, inequal_string);
EXPECT_NE(string2, inequal_string);
EXPECT_NE(inequal_string, string1);
EXPECT_NE(inequal_string, string2);
// NOTE(review): this branch also calls inequal_string.utf16_view(), so it presumably requires inequal_string
// to have UTF-16 storage whenever string1 does. The only call below with a UTF-16 string1 satisfies that.
if (string1.has_long_utf16_storage()) {
EXPECT_NE(string1, inequal_string.utf16_view());
EXPECT_NE(string2, inequal_string.utf16_view());
EXPECT_NE(inequal_string, string1.utf16_view());
EXPECT_NE(inequal_string, string2.utf16_view());
EXPECT_NE(string1.utf16_view(), inequal_string);
EXPECT_NE(string2.utf16_view(), inequal_string);
EXPECT_NE(inequal_string.utf16_view(), string1);
EXPECT_NE(inequal_string.utf16_view(), string2);
}
};
// Short (empty) ASCII string comparison.
test(Utf16String {}, "hello"_utf16);
// Short ASCII string comparison.
test("hello"_utf16, "there"_utf16);
// Short and long ASCII string comparison.
test("hello"_utf16, "hello there general!"_utf16);
// Long ASCII string comparison.
test("hello there!"_utf16, "hello there general!"_utf16);
// UTF-16 string comparison.
test("😀"_utf16, "hello 😀"_utf16);
// Short ASCII and UTF-16 string comparison.
test("hello"_utf16, "😀"_utf16);
// Short ASCII and UTF-16 string of same code unit length comparison.
test("ab"_utf16, "😀"_utf16);
// Long ASCII and UTF-16 string comparison.
test("hello there general!"_utf16, "😀"_utf16);
// Long ASCII and UTF-16 string of same code unit length comparison.
test("ababababab"_utf16, "😀😀😀😀😀"_utf16);
}
TEST_CASE(equals_ascii)
{
    // A string built from `ascii` must equal `ascii` with the operands in either order, while `inequal_string`
    // must not.
    auto verify = [](StringView ascii, Utf16String const& inequal_string) {
        auto converted = Utf16String::from_utf8(ascii);

        EXPECT_EQ(ascii, converted);
        EXPECT_EQ(converted, ascii);

        EXPECT_NE(ascii, inequal_string);
        EXPECT_NE(inequal_string, ascii);
    };

    // Short (empty) ASCII string comparison.
    verify({}, "hello"_utf16);

    // Short ASCII string comparison.
    verify("hello"sv, "there"_utf16);

    // Short and long ASCII string comparison.
    verify("hello"sv, "hello there general!"_utf16);

    // Long ASCII string comparison.
    verify("hello there!"sv, "hello there general!"_utf16);

    // Short ASCII and UTF-16 string comparison.
    verify("hello"sv, "😀"_utf16);

    // Short ASCII and UTF-16 string of same code unit length comparison.
    verify("ab"sv, "😀"_utf16);

    // Long ASCII and UTF-16 string comparison.
    verify("hello there general!"sv, "😀"_utf16);

    // Long ASCII and UTF-16 string of same code unit length comparison.
    verify("ababababab"sv, "😀😀😀😀😀"_utf16);

    // Non-ASCII string comparison.
    EXPECT_NE("😀"sv, "😀"_utf16);
}
// Exercises equals_ignoring_ascii_case between Utf16String values, and between Utf16String and Utf16View.
TEST_CASE(equals_ignoring_ascii_case)
{
auto test = [](Utf16String const& string1, Utf16String const& inequal_string) {
// Build string2 from string1 by passing every code point at an even index through to_ascii_uppercase.
StringBuilder builder;
for (auto [i, code_point] : enumerate(string1))
builder.append_code_point(i % 2 == 0 ? to_ascii_uppercase(code_point) : code_point);
auto string2 = Utf16String::from_utf8(builder.string_view());
EXPECT(string1.equals_ignoring_ascii_case(string1));
EXPECT(string1.equals_ignoring_ascii_case(string2));
EXPECT(string2.equals_ignoring_ascii_case(string1));
EXPECT(string2.equals_ignoring_ascii_case(string2));
// The utf16_view() comparisons are only performed when string1 has long UTF-16 storage.
if (string1.has_long_utf16_storage()) {
EXPECT(string1.equals_ignoring_ascii_case(string1.utf16_view()));
EXPECT(string1.equals_ignoring_ascii_case(string2.utf16_view()));
EXPECT(string2.equals_ignoring_ascii_case(string1.utf16_view()));
EXPECT(string2.equals_ignoring_ascii_case(string2.utf16_view()));
}
EXPECT(!string1.equals_ignoring_ascii_case(inequal_string));
EXPECT(!string2.equals_ignoring_ascii_case(inequal_string));
EXPECT(!inequal_string.equals_ignoring_ascii_case(string1));
EXPECT(!inequal_string.equals_ignoring_ascii_case(string2));
// NOTE(review): this branch also calls inequal_string.utf16_view(), so it presumably requires inequal_string
// to have UTF-16 storage whenever string1 does. The only call below with a UTF-16 string1 satisfies that.
if (string1.has_long_utf16_storage()) {
EXPECT(!string1.equals_ignoring_ascii_case(inequal_string.utf16_view()));
EXPECT(!string2.equals_ignoring_ascii_case(inequal_string.utf16_view()));
EXPECT(!inequal_string.equals_ignoring_ascii_case(string1.utf16_view()));
EXPECT(!inequal_string.equals_ignoring_ascii_case(string2.utf16_view()));
}
};
// Short (empty) ASCII string comparison.
test(Utf16String {}, "hello"_utf16);
// Short ASCII string comparison.
test("hello"_utf16, "there"_utf16);
// Short and long ASCII string comparison.
test("hello"_utf16, "hello there general!"_utf16);
// Long ASCII string comparison.
test("hello there!"_utf16, "hello there general!"_utf16);
// UTF-16 string comparison.
test("😀"_utf16, "hello 😀"_utf16);
// Short ASCII and UTF-16 string comparison.
test("hello"_utf16, "😀"_utf16);
// Short ASCII and UTF-16 string of same code unit length comparison.
test("ab"_utf16, "😀"_utf16);
// Long ASCII and UTF-16 string comparison.
test("hello there general!"_utf16, "😀"_utf16);
// Long ASCII and UTF-16 string of same code unit length comparison.
test("ababababab"_utf16, "😀😀😀😀😀"_utf16);
}
TEST_CASE(iteration)
{
    // Iterating `string` must yield exactly `expected_code_points`, and its end iterator must neither be
    // dereferenceable nor incrementable.
    auto verify = [](Utf16String const& string, ReadonlySpan<u32> expected_code_points) {
        EXPECT_EQ(string.length_in_code_points(), expected_code_points.size());

        for (auto [index, code_point] : enumerate(string)) {
            if (expected_code_points.size() != 0) {
                EXPECT_EQ(code_point, expected_code_points[index]);
            } else {
                FAIL("Iterating an empty UTF-16 string should not produce any values");
            }
        }

        auto iterator = string.end();
        EXPECT_DEATH("Dereferencing a UTF-16 iterator which is at its end", *iterator);
        EXPECT_DEATH("Incrementing a UTF-16 iterator which is at its end", ++iterator);
    };

    verify({}, {});
    verify("hello"_utf16, { { 'h', 'e', 'l', 'l', 'o' } });
    verify("hello there general!"_utf16, { { 'h', 'e', 'l', 'l', 'o', ' ', 't', 'h', 'e', 'r', 'e', ' ', 'g', 'e', 'n', 'e', 'r', 'a', 'l', '!' } });
    verify("😀"_utf16, { { 0x1f600 } });
    verify("hello 😀 there!"_utf16, { { 'h', 'e', 'l', 'l', 'o', ' ', 0x1f600, ' ', 't', 'h', 'e', 'r', 'e', '!' } });
}
TEST_CASE(code_unit_at)
{
    // Every code unit of a string built from `view` must match the code unit of `view` at the same index.
    auto verify = [](Utf16View const& view, size_t expected_length) {
        auto string = Utf16String::from_utf16(view);
        EXPECT_EQ(string.length_in_code_units(), expected_length);

        for (size_t index = 0; index < expected_length; ++index)
            EXPECT_EQ(string.code_unit_at(index), view.code_unit_at(index));
    };

    verify({}, 0);
    verify(u"hello"sv, 5);
    verify(u"hello there general!"sv, 20);
    verify(u"😀"sv, 2);
    verify(u"hello 😀 there!"sv, 15);
}
TEST_CASE(code_point_at)
{
    // At every code unit index, a string built from `view` must report the same code point as `view` does.
    auto verify = [](Utf16View const& view, size_t expected_code_points) {
        auto string = Utf16String::from_utf16(view);
        EXPECT_EQ(string.length_in_code_points(), expected_code_points);

        auto const code_unit_count = string.length_in_code_units();
        for (size_t index = 0; index < code_unit_count; ++index)
            EXPECT_EQ(string.code_point_at(index), view.code_point_at(index));
    };

    verify({}, 0);
    verify(u"hello"sv, 5);
    verify(u"hello there general!"sv, 20);
    verify(u"😀"sv, 1);
    verify(u"hello 😀 there!"sv, 14);
}