mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-07-29 12:19:54 +00:00
AK: Add a UTF-16 string with optimized short- and ASCII-string storage
This is a strictly UTF-16 string with some optimizations for ASCII.
* If created from a short UTF-8 or UTF-16 string that is also ASCII,
then the string is stored in an inlined byte buffer.
* If created with a long UTF-8 or UTF-16 string that is also ASCII,
then the string is stored in an outlined char buffer.
* If created with a short or long UTF-8 or UTF-16 string that is not
ASCII, then the string is stored in an outlined char16 buffer.
We do not store short non-ASCII text in the inlined buffer to avoid
confusion with operations such as `length_in_code_units` and
`code_unit_at`. For example, "😀" would be stored as 4 UTF-8 bytes
in short string form. But we still want `length_in_code_units` to
be 2, and `code_unit_at(0)` to be 0xD83D.
This commit is contained in:
parent
8fbb80fffc
commit
fe676585f5
Notes:
github-actions[bot]
2025-07-18 16:47:31 +00:00
Author: https://github.com/trflynn89
Commit: fe676585f5
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5388
Reviewed-by: https://github.com/shannonbooth ✅
17 changed files with 1527 additions and 44 deletions
|
@ -29,6 +29,8 @@ set(SOURCES
|
|||
StringUtils.cpp
|
||||
StringView.cpp
|
||||
Time.cpp
|
||||
Utf16String.cpp
|
||||
Utf16StringData.cpp
|
||||
Utf16View.cpp
|
||||
Utf32View.cpp
|
||||
Utf8View.cpp
|
||||
|
|
|
@ -19,6 +19,7 @@ template<size_t inline_capacity>
|
|||
class ByteBuffer;
|
||||
|
||||
class StringData;
|
||||
class Utf16StringData;
|
||||
|
||||
}
|
||||
|
||||
|
@ -52,6 +53,7 @@ class String;
|
|||
class StringBuilder;
|
||||
class StringView;
|
||||
class UnixDateTime;
|
||||
class Utf16String;
|
||||
class Utf16View;
|
||||
class Utf32CodePointIterator;
|
||||
class Utf32View;
|
||||
|
@ -198,6 +200,7 @@ using AK::StringView;
|
|||
using AK::TrailingCodePointTransformation;
|
||||
using AK::Traits;
|
||||
using AK::UnixDateTime;
|
||||
using AK::Utf16String;
|
||||
using AK::Utf16View;
|
||||
using AK::Utf32CodePointIterator;
|
||||
using AK::Utf32View;
|
||||
|
|
|
@ -249,8 +249,10 @@ ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
|
|||
{
|
||||
if (utf16_view.is_empty())
|
||||
return {};
|
||||
if (utf16_view.has_ascii_storage())
|
||||
return try_append(utf16_view.bytes());
|
||||
|
||||
auto remaining_view = utf16_view.span();
|
||||
auto remaining_view = utf16_view.utf16_span();
|
||||
auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(remaining_view);
|
||||
|
||||
// Possibly over-allocate a little to ensure we don't have to allocate later.
|
||||
|
|
73
AK/Utf16String.cpp
Normal file
73
AK/Utf16String.cpp
Normal file
|
@ -0,0 +1,73 @@
|
|||
/*
|
||||
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/Utf16String.h>
|
||||
#include <AK/Utf32View.h>
|
||||
|
||||
#include <simdutf.h>
|
||||
|
||||
namespace AK {
|
||||
|
||||
static_assert(sizeof(Detail::ShortString) == sizeof(Detail::Utf16StringData*));
|
||||
|
||||
// Builds a Utf16String from UTF-8 input without checking that the input is valid UTF-8.
//
// Input that is both ASCII and no longer than MAX_SHORT_STRING_BYTE_COUNT is copied into the inline short-string
// buffer. Everything else is delegated to Utf16StringData::from_utf8, with ASCII storage allowed.
Utf16String Utf16String::from_utf8_without_validation(StringView utf8_string)
{
    auto const byte_count = utf8_string.length();

    // The length is checked first so that the ASCII scan only runs on input that could fit inline.
    bool const fits_inline = byte_count <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf8_string.is_ascii();

    if (!fits_inline)
        return Utf16String { Detail::Utf16StringData::from_utf8(utf8_string, Detail::Utf16StringData::AllowASCIIStorage::Yes) };

    Utf16String inline_string;
    inline_string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(byte_count);

    auto copied = utf8_string.bytes().copy_to(inline_string.m_value.short_ascii_string.storage);
    VERIFY(copied == byte_count);

    return inline_string;
}
|
||||
|
||||
// Builds a Utf16String from a UTF-16 view without checking that the input is valid UTF-16.
//
// A view that is ASCII and has at most MAX_SHORT_STRING_BYTE_COUNT code units is stored in the inline short-string
// buffer: its bytes are copied directly when the view already has ASCII storage, otherwise the 16-bit code units are
// narrowed via simdutf. Everything else is delegated to Utf16StringData::from_utf16.
Utf16String Utf16String::from_utf16_without_validation(Utf16View const& utf16_string)
{
    auto const code_unit_count = utf16_string.length_in_code_units();

    // The length is checked first so that the ASCII scan only runs on input that could fit inline.
    bool const fits_inline = code_unit_count <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf16_string.is_ascii();

    if (!fits_inline)
        return Utf16String { Detail::Utf16StringData::from_utf16(utf16_string) };

    Utf16String inline_string;
    inline_string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(code_unit_count);

    if (!utf16_string.has_ascii_storage()) {
        auto converted = simdutf::convert_utf16_to_utf8(utf16_string.utf16_span().data(), code_unit_count, reinterpret_cast<char*>(inline_string.m_value.short_ascii_string.storage));
        VERIFY(converted == code_unit_count);
    } else {
        auto copied = utf16_string.bytes().copy_to(inline_string.m_value.short_ascii_string.storage);
        VERIFY(copied == code_unit_count);
    }

    return inline_string;
}
|
||||
|
||||
// Builds a Utf16String from a UTF-32 view.
//
// A view that is ASCII and has at most MAX_SHORT_STRING_BYTE_COUNT code points is narrowed into the inline
// short-string buffer via simdutf. Everything else is delegated to Utf16StringData::from_utf32.
Utf16String Utf16String::from_utf32(Utf32View const& utf32_string)
{
    auto const code_point_count = utf32_string.length();

    // The length is checked first so that the ASCII scan only runs on input that could fit inline.
    bool const fits_inline = code_point_count <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf32_string.is_ascii();

    if (!fits_inline)
        return Utf16String { Detail::Utf16StringData::from_utf32(utf32_string) };

    Utf16String inline_string;
    inline_string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(code_point_count);

    auto const* source = reinterpret_cast<char32_t const*>(utf32_string.code_points());
    auto* destination = reinterpret_cast<char*>(inline_string.m_value.short_ascii_string.storage);

    auto converted = simdutf::convert_utf32_to_utf8(source, code_point_count, destination);
    VERIFY(converted == code_point_count);

    return inline_string;
}
|
||||
|
||||
// Formats a Utf16String. Strings with ASCII storage (short or long) are emitted through put_string(); strings with
// long UTF-16 storage are appended to the underlying StringBuilder as a UTF-16 view.
ErrorOr<void> Formatter<Utf16String>::format(FormatBuilder& builder, Utf16String const& utf16_string)
{
    if (!utf16_string.has_long_utf16_storage())
        return builder.put_string(utf16_string.ascii_view());

    return builder.builder().try_append(utf16_string.utf16_view());
}
|
||||
|
||||
}
|
117
AK/Utf16String.h
Normal file
117
AK/Utf16String.h
Normal file
|
@ -0,0 +1,117 @@
|
|||
/*
|
||||
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Badge.h>
|
||||
#include <AK/Error.h>
|
||||
#include <AK/Format.h>
|
||||
#include <AK/NonnullRefPtr.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Traits.h>
|
||||
#include <AK/UnicodeUtils.h>
|
||||
#include <AK/Utf16StringBase.h>
|
||||
#include <AK/Utf16StringData.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
|
||||
namespace AK {
|
||||
|
||||
// Utf16String is a strongly owned sequence of Unicode code points encoded as UTF-16.
|
||||
//
|
||||
// The data may or may not be heap-allocated, and may or may not be reference counted. As a memory optimization, if the
|
||||
// UTF-16 string is entirely ASCII, the string is stored as 8-bit bytes.
|
||||
class [[nodiscard]] Utf16String : public Detail::Utf16StringBase {
|
||||
public:
|
||||
using Utf16StringBase::Utf16StringBase;
|
||||
|
||||
explicit constexpr Utf16String(Utf16StringBase&& base)
|
||||
: Utf16StringBase(move(base))
|
||||
{
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static Utf16String from_utf8(StringView utf8_string)
|
||||
{
|
||||
VERIFY(Utf8View { utf8_string }.validate());
|
||||
return from_utf8_without_validation(utf8_string);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static Utf16String from_utf8(String const& utf8_string)
|
||||
{
|
||||
return from_utf8_without_validation(utf8_string);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static ErrorOr<Utf16String> try_from_utf8(StringView utf8_string)
|
||||
{
|
||||
if (!Utf8View { utf8_string }.validate())
|
||||
return Error::from_string_literal("Input was not valid UTF-8");
|
||||
return from_utf8_without_validation(utf8_string);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static Utf16String from_utf16(Utf16View const& utf16_string)
|
||||
{
|
||||
VERIFY(utf16_string.validate());
|
||||
return from_utf16_without_validation(utf16_string);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static ErrorOr<Utf16String> try_from_utf16(Utf16View const& utf16_string)
|
||||
{
|
||||
if (!utf16_string.validate())
|
||||
return Error::from_string_literal("Input was not valid UTF-16");
|
||||
return from_utf16_without_validation(utf16_string);
|
||||
}
|
||||
|
||||
static Utf16String from_utf8_without_validation(StringView);
|
||||
static Utf16String from_utf16_without_validation(Utf16View const&);
|
||||
static Utf16String from_utf32(Utf32View const&);
|
||||
|
||||
template<typename T>
|
||||
requires(IsOneOf<RemoveCVReference<T>, Utf16String>)
|
||||
static Utf16String from_utf16(T&&) = delete;
|
||||
|
||||
template<typename T>
|
||||
requires(IsOneOf<RemoveCVReference<T>, Utf16String>)
|
||||
static ErrorOr<Utf16String> try_from_utf16(T&&) = delete;
|
||||
|
||||
template<typename T>
|
||||
requires(IsOneOf<RemoveCVReference<T>, Utf16String>)
|
||||
static Utf16String from_utf16_without_validation(T&&) = delete;
|
||||
|
||||
private:
|
||||
ALWAYS_INLINE explicit Utf16String(NonnullRefPtr<Detail::Utf16StringData const> value)
|
||||
: Utf16StringBase(move(value))
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct Formatter<Utf16String> : Formatter<FormatString> {
|
||||
ErrorOr<void> format(FormatBuilder&, Utf16String const&);
|
||||
};
|
||||
|
||||
template<>
|
||||
struct Traits<Utf16String> : public DefaultTraits<Utf16String> {
|
||||
static unsigned hash(Utf16String const& s) { return s.hash(); }
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
// User-defined literal: "text"_utf16 creates a Utf16String from a narrow (UTF-8) string literal.
// The literal is only checked for validity through ASSERT.
[[nodiscard]] ALWAYS_INLINE AK::Utf16String operator""_utf16(char const* string, size_t length)
{
    auto const literal = AK::StringView { string, length };
    ASSERT(AK::Utf8View { literal }.validate());

    return AK::Utf16String::from_utf8_without_validation(literal);
}
|
||||
|
||||
// User-defined literal: u"text"_utf16 creates a Utf16String from a UTF-16 string literal.
// The literal is only checked for validity through ASSERT.
[[nodiscard]] ALWAYS_INLINE AK::Utf16String operator""_utf16(char16_t const* string, size_t length)
{
    auto const literal = AK::Utf16View { string, length };
    ASSERT(literal.validate());

    return AK::Utf16String::from_utf16_without_validation(literal);
}
|
268
AK/Utf16StringBase.h
Normal file
268
AK/Utf16StringBase.h
Normal file
|
@ -0,0 +1,268 @@
|
|||
/*
|
||||
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/NonnullRefPtr.h>
|
||||
#include <AK/StringBase.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Utf16StringData.h>
|
||||
#include <AK/Utf16View.h>
|
||||
|
||||
namespace AK::Detail {
|
||||
|
||||
class Utf16StringBase {
|
||||
public:
|
||||
constexpr Utf16StringBase()
|
||||
: Utf16StringBase(ShortString::create_empty())
|
||||
{
|
||||
}
|
||||
|
||||
explicit constexpr Utf16StringBase(ShortString short_string)
|
||||
: m_value { .short_ascii_string = short_string }
|
||||
{
|
||||
}
|
||||
|
||||
ALWAYS_INLINE explicit Utf16StringBase(NonnullRefPtr<Utf16StringData const> value)
|
||||
: m_value { .data = &value.leak_ref() }
|
||||
{
|
||||
}
|
||||
|
||||
ALWAYS_INLINE Utf16StringBase(Utf16StringBase const& other)
|
||||
: m_value(other.m_value)
|
||||
{
|
||||
if (has_long_storage())
|
||||
data_without_union_member_assertion()->ref();
|
||||
}
|
||||
|
||||
constexpr Utf16StringBase(Utf16StringBase&& other)
|
||||
: m_value(other.m_value)
|
||||
{
|
||||
other.m_value = { .short_ascii_string = ShortString::create_empty() };
|
||||
}
|
||||
|
||||
constexpr ~Utf16StringBase()
|
||||
{
|
||||
if (!is_constant_evaluated())
|
||||
destroy_string();
|
||||
}
|
||||
|
||||
ALWAYS_INLINE operator Utf16View() const& { return utf16_view(); }
|
||||
explicit operator Utf16View() const&& = delete;
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE String to_utf8(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const
|
||||
{
|
||||
return MUST(utf16_view().to_utf8(allow_lonely_surrogates));
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE String to_utf8_but_should_be_ported_to_utf16(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const
|
||||
{
|
||||
return to_utf8(allow_lonely_surrogates);
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE ByteString to_byte_string(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const
|
||||
{
|
||||
return MUST(utf16_view().to_byte_string(allow_lonely_surrogates));
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE StringView ascii_view() const&
|
||||
{
|
||||
if (has_short_ascii_storage())
|
||||
return short_ascii_string_without_union_member_assertion().bytes();
|
||||
|
||||
VERIFY(has_long_ascii_storage());
|
||||
return data_without_union_member_assertion()->ascii_view();
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE Utf16View utf16_view() const&
|
||||
{
|
||||
if (has_short_ascii_storage())
|
||||
return Utf16View { ascii_view().characters_without_null_termination(), length_in_code_units() };
|
||||
return data_without_union_member_assertion()->utf16_view();
|
||||
}
|
||||
|
||||
StringView ascii_view() const&& = delete;
|
||||
Utf16View utf16_view() const&& = delete;
|
||||
|
||||
ALWAYS_INLINE Utf16StringBase& operator=(Utf16StringBase const& other)
|
||||
{
|
||||
if (&other != this) {
|
||||
if (has_long_storage())
|
||||
data_without_union_member_assertion()->unref();
|
||||
|
||||
m_value = other.m_value;
|
||||
|
||||
if (has_long_storage())
|
||||
data_without_union_member_assertion()->ref();
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
ALWAYS_INLINE Utf16StringBase& operator=(Utf16StringBase&& other)
|
||||
{
|
||||
if (has_long_storage())
|
||||
data_without_union_member_assertion()->unref();
|
||||
|
||||
m_value = exchange(other.m_value, { .short_ascii_string = ShortString::create_empty() });
|
||||
return *this;
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE bool operator==(Utf16StringBase const& other) const
|
||||
{
|
||||
if (has_short_ascii_storage() && other.has_short_ascii_storage())
|
||||
return bit_cast<FlatPtr>(m_value) == bit_cast<FlatPtr>(other.m_value);
|
||||
|
||||
if (has_long_storage() && other.has_long_storage())
|
||||
return *data_without_union_member_assertion() == *other.data_without_union_member_assertion();
|
||||
|
||||
return utf16_view() == other.utf16_view();
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE bool operator==(Utf16View const& other) const { return utf16_view() == other; }
|
||||
[[nodiscard]] ALWAYS_INLINE bool operator==(StringView other) const { return utf16_view() == other; }
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE bool equals_ignoring_ascii_case(Utf16View const& other) const { return utf16_view().equals_ignoring_ascii_case(other); }
|
||||
[[nodiscard]] ALWAYS_INLINE bool equals_ignoring_ascii_case(Utf16StringBase const& other) const { return utf16_view().equals_ignoring_ascii_case(other.utf16_view()); }
|
||||
|
||||
template<typename... Ts>
|
||||
[[nodiscard]] ALWAYS_INLINE bool is_one_of(Ts&&... strings) const
|
||||
{
|
||||
return (this->operator==(forward<Ts>(strings)) || ...);
|
||||
}
|
||||
|
||||
template<typename... Ts>
|
||||
[[nodiscard]] ALWAYS_INLINE bool is_one_of_ignoring_ascii_case(Ts&&... strings) const
|
||||
{
|
||||
return (this->equals_ignoring_ascii_case(forward<Ts>(strings)) || ...);
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE u32 hash() const
|
||||
{
|
||||
if (has_short_ascii_storage())
|
||||
return StringView { short_ascii_string_without_union_member_assertion().bytes() }.hash();
|
||||
return data_without_union_member_assertion()->hash();
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE bool is_empty() const { return length_in_code_units() == 0uz; }
|
||||
[[nodiscard]] ALWAYS_INLINE bool is_ascii() const { return utf16_view().is_ascii(); }
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE size_t length_in_code_units() const
|
||||
{
|
||||
if (has_short_ascii_storage())
|
||||
return short_ascii_string_without_union_member_assertion().byte_count();
|
||||
return data_without_union_member_assertion()->length_in_code_units();
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const
|
||||
{
|
||||
if (has_short_ascii_storage())
|
||||
return short_ascii_string_without_union_member_assertion().byte_count();
|
||||
return data_without_union_member_assertion()->length_in_code_points();
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE char16_t code_unit_at(size_t code_unit_offset) const { return utf16_view().code_unit_at(code_unit_offset); }
|
||||
[[nodiscard]] ALWAYS_INLINE u32 code_point_at(size_t code_unit_offset) const { return utf16_view().code_point_at(code_unit_offset); }
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE size_t code_unit_offset_of(size_t code_point_offset) const
|
||||
{
|
||||
if (has_ascii_storage())
|
||||
return code_point_offset;
|
||||
return utf16_view().code_unit_offset_of(code_point_offset);
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE size_t code_point_offset_of(size_t code_unit_offset) const
|
||||
{
|
||||
if (has_ascii_storage())
|
||||
return code_unit_offset;
|
||||
return utf16_view().code_point_offset_of(code_unit_offset);
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE Utf16CodePointIterator begin() const { return utf16_view().begin(); }
|
||||
[[nodiscard]] ALWAYS_INLINE Utf16CodePointIterator end() const { return utf16_view().end(); }
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const
|
||||
{
|
||||
return utf16_view().substring_view(code_unit_offset, code_unit_length);
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE Utf16View substring_view(size_t code_unit_offset) const
|
||||
{
|
||||
return utf16_view().substring_view(code_unit_offset);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE Optional<size_t> find_code_unit_offset(char16_t needle, size_t start_offset = 0) const
|
||||
{
|
||||
return utf16_view().find_code_unit_offset(needle, start_offset);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE Optional<size_t> find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const
|
||||
{
|
||||
return utf16_view().find_code_unit_offset(needle, start_offset);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE Optional<size_t> find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset = 0) const
|
||||
{
|
||||
return utf16_view().find_code_unit_offset_ignoring_case(needle, start_offset);
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE bool starts_with(Utf16View const& needle) const { return utf16_view().starts_with(needle); }
|
||||
|
||||
// This is primarily interesting to unit tests.
|
||||
[[nodiscard]] constexpr bool has_short_ascii_storage() const
|
||||
{
|
||||
if (is_constant_evaluated())
|
||||
return (m_value.short_ascii_string.byte_count_and_short_string_flag & StringBase::SHORT_STRING_FLAG) != 0;
|
||||
return (short_ascii_string_without_union_member_assertion().byte_count_and_short_string_flag & StringBase::SHORT_STRING_FLAG) != 0;
|
||||
}
|
||||
|
||||
// This is primarily interesting to unit tests.
|
||||
[[nodiscard]] ALWAYS_INLINE bool has_long_ascii_storage() const
|
||||
{
|
||||
if (has_short_ascii_storage())
|
||||
return false;
|
||||
return data_without_union_member_assertion()->has_ascii_storage();
|
||||
}
|
||||
|
||||
// This is primarily interesting to unit tests.
|
||||
[[nodiscard]] ALWAYS_INLINE bool has_ascii_storage() const
|
||||
{
|
||||
return has_short_ascii_storage() || has_long_ascii_storage();
|
||||
}
|
||||
|
||||
// This is primarily interesting to unit tests.
|
||||
[[nodiscard]] ALWAYS_INLINE bool has_long_utf16_storage() const
|
||||
{
|
||||
if (has_short_ascii_storage())
|
||||
return false;
|
||||
return data_without_union_member_assertion()->has_utf16_storage();
|
||||
}
|
||||
|
||||
// This is primarily interesting to unit tests.
|
||||
[[nodiscard]] ALWAYS_INLINE bool has_long_storage() const
|
||||
{
|
||||
return !has_short_ascii_storage();
|
||||
}
|
||||
|
||||
protected:
|
||||
ALWAYS_INLINE void destroy_string() const
|
||||
{
|
||||
if (has_long_storage())
|
||||
data_without_union_member_assertion()->unref();
|
||||
}
|
||||
|
||||
// This is technically **invalid**! See StringBase for details.
|
||||
ALWAYS_INLINE ShortString const& short_ascii_string_without_union_member_assertion() const { return *__builtin_launder(&m_value.short_ascii_string); }
|
||||
ALWAYS_INLINE Utf16StringData const* data_without_union_member_assertion() const { return *__builtin_launder(&m_value.data); }
|
||||
|
||||
union {
|
||||
ShortString short_ascii_string;
|
||||
Utf16StringData const* data;
|
||||
} m_value;
|
||||
};
|
||||
|
||||
}
|
148
AK/Utf16StringData.cpp
Normal file
148
AK/Utf16StringData.cpp
Normal file
|
@ -0,0 +1,148 @@
|
|||
/*
|
||||
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/TypedTransfer.h>
|
||||
#include <AK/Utf16StringData.h>
|
||||
#include <AK/Utf32View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
|
||||
#include <simdutf.h>
|
||||
|
||||
namespace AK::Detail {
|
||||
|
||||
// Due to internal optimizations, we have an explicit maximum string length of 2**63 - 1: no bit at or above
// Detail::UTF16_FLAG may be set in a length.
//
// The argument is parenthesized so that any expression can be passed safely, and the macro deliberately does not end
// in a semicolon; call sites supply their own.
#define VERIFY_UTF16_LENGTH(length) VERIFY(((length) >> Detail::UTF16_FLAG) == 0)
|
||||
|
||||
// Allocates a Utf16StringData with trailing room for `code_unit_length` code units of the requested storage type.
// The trailing character data is left uninitialized; callers are expected to fill it in.
NonnullRefPtr<Utf16StringData> Utf16StringData::create_uninitialized(StorageType storage_type, size_t code_unit_length)
{
    // ASCII storage uses one byte per code unit, UTF-16 storage uses one char16_t per code unit.
    size_t const code_unit_size = storage_type == Utf16StringData::StorageType::ASCII ? sizeof(char) : sizeof(char16_t);
    size_t const bytes_needed = sizeof(Utf16StringData) + (code_unit_size * code_unit_length);

    void* memory = malloc(bytes_needed);
    VERIFY(memory);

    // The object is constructed in place, in front of its character data.
    auto* string_data = new (memory) Utf16StringData(storage_type, code_unit_length);
    return adopt_ref(*string_data);
}
|
||||
|
||||
template<typename ViewType>
|
||||
NonnullRefPtr<Utf16StringData> Utf16StringData::create_from_code_point_iterable(ViewType const& view)
|
||||
{
|
||||
size_t code_unit_length = 0;
|
||||
size_t code_point_length = 0;
|
||||
|
||||
for (auto code_point : view) {
|
||||
code_unit_length += UnicodeUtils::code_unit_length_for_code_point(code_point);
|
||||
++code_point_length;
|
||||
}
|
||||
|
||||
VERIFY_UTF16_LENGTH(code_unit_length);
|
||||
|
||||
auto string = create_uninitialized(StorageType::UTF16, code_unit_length);
|
||||
string->m_length_in_code_points = code_point_length;
|
||||
|
||||
size_t code_unit_index = 0;
|
||||
|
||||
for (auto code_point : view) {
|
||||
(void)UnicodeUtils::code_point_to_utf16(code_point, [&](auto code_unit) {
|
||||
string->m_utf16_data[code_unit_index++] = code_unit;
|
||||
});
|
||||
}
|
||||
|
||||
return string;
|
||||
}
|
||||
|
||||
// Creates string data from UTF-8 input. There are three paths:
//  * ASCII input is copied byte-for-byte into ASCII storage, if the caller allows ASCII storage.
//  * Input that validates as UTF-8 without lonely surrogates is transcoded with simdutf.
//  * Anything else is transcoded one code point at a time.
NonnullRefPtr<Utf16StringData> Utf16StringData::from_utf8(StringView utf8_string, AllowASCIIStorage allow_ascii_storage)
{
    auto const* utf8_data = utf8_string.characters_without_null_termination();
    auto const utf8_length = utf8_string.length();

    if (allow_ascii_storage == AllowASCIIStorage::Yes && utf8_string.is_ascii()) {
        VERIFY_UTF16_LENGTH(utf8_length);

        auto ascii_string = create_uninitialized(StorageType::ASCII, utf8_length);
        TypedTransfer<char>::copy(ascii_string->m_ascii_data, utf8_data, utf8_length);

        return ascii_string;
    }

    Utf8View view { utf8_string };

    // simdutf is only used on input that passes strict validation; everything else takes the slow path.
    if (!view.validate(AllowLonelySurrogates::No))
        return create_from_code_point_iterable(view);

    auto const code_unit_length = simdutf::utf16_length_from_utf8(utf8_data, utf8_length);
    VERIFY_UTF16_LENGTH(code_unit_length);

    auto utf16_string = create_uninitialized(StorageType::UTF16, code_unit_length);

    auto const converted = simdutf::convert_utf8_to_utf16(utf8_data, utf8_length, utf16_string->m_utf16_data);
    VERIFY(converted == code_unit_length);

    return utf16_string;
}
|
||||
|
||||
// Creates string data from a UTF-16 view. There are three paths:
//  * A view that already has ASCII storage is copied byte-for-byte into ASCII storage.
//  * A 16-bit view that only contains ASCII is narrowed into ASCII storage with simdutf.
//  * Anything else is copied into UTF-16 storage, carrying over the view's code point length field.
NonnullRefPtr<Utf16StringData> Utf16StringData::from_utf16(Utf16View const& utf16_string)
{
    auto const code_unit_length = utf16_string.length_in_code_units();
    VERIFY_UTF16_LENGTH(code_unit_length);

    if (utf16_string.has_ascii_storage()) {
        auto copy = create_uninitialized(StorageType::ASCII, code_unit_length);
        TypedTransfer<char>::copy(copy->m_ascii_data, utf16_string.ascii_span().data(), code_unit_length);

        return copy;
    }

    if (utf16_string.is_ascii()) {
        auto narrowed = create_uninitialized(StorageType::ASCII, code_unit_length);

        auto const converted = simdutf::convert_utf16_to_utf8(utf16_string.utf16_span().data(), code_unit_length, narrowed->m_ascii_data);
        VERIFY(converted == code_unit_length);

        return narrowed;
    }

    auto copy = create_uninitialized(StorageType::UTF16, code_unit_length);
    TypedTransfer<char16_t>::copy(copy->m_utf16_data, utf16_string.utf16_span().data(), code_unit_length);
    copy->m_length_in_code_points = utf16_string.m_length_in_code_points;

    return copy;
}
|
||||
|
||||
// Creates string data from a UTF-32 view. There are three paths:
//  * ASCII input is narrowed into ASCII storage with simdutf.
//  * Input that simdutf accepts as valid UTF-32 is transcoded into UTF-16 storage with simdutf.
//  * Anything else is transcoded one code point at a time.
NonnullRefPtr<Utf16StringData> Utf16StringData::from_utf32(Utf32View const& utf32_string)
{
    auto const* source = reinterpret_cast<char32_t const*>(utf32_string.code_points());
    auto const source_length = utf32_string.length();

    if (utf32_string.is_ascii()) {
        VERIFY_UTF16_LENGTH(source_length);

        auto narrowed = create_uninitialized(StorageType::ASCII, source_length);

        auto const converted = simdutf::convert_utf32_to_utf8(source, source_length, narrowed->m_ascii_data);
        VERIFY(converted == source_length);

        return narrowed;
    }

    // simdutf is only used on input it considers valid; everything else takes the slow path.
    if (!simdutf::validate_utf32(source, source_length))
        return create_from_code_point_iterable(utf32_string);

    auto const code_unit_length = simdutf::utf16_length_from_utf32(source, source_length);
    VERIFY_UTF16_LENGTH(code_unit_length);

    auto transcoded = create_uninitialized(StorageType::UTF16, code_unit_length);
    // Each UTF-32 element is one code point, so the code point length is known up front.
    transcoded->m_length_in_code_points = source_length;

    auto const converted = simdutf::convert_utf32_to_utf16(source, source_length, transcoded->m_utf16_data);
    VERIFY(converted == code_unit_length);

    return transcoded;
}
|
||||
|
||||
// Counts the code points in UTF-16 storage. Must not be called on ASCII storage.
size_t Utf16StringData::calculate_code_point_length() const
{
    ASSERT(!has_ascii_storage());

    auto const code_unit_length = length_in_code_units();

    // If simdutf does not accept the data as valid UTF-16, count by iterating the view instead.
    if (!simdutf::validate_utf16(m_utf16_data, code_unit_length)) {
        size_t count = 0;
        for ([[maybe_unused]] auto code_point : utf16_view())
            ++count;
        return count;
    }

    return simdutf::count_utf16(m_utf16_data, code_unit_length);
}
|
||||
|
||||
}
|
133
AK/Utf16StringData.h
Normal file
133
AK/Utf16StringData.h
Normal file
|
@ -0,0 +1,133 @@
|
|||
/*
|
||||
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/NonnullRefPtr.h>
|
||||
#include <AK/NumericLimits.h>
|
||||
#include <AK/RefCounted.h>
|
||||
#include <AK/Span.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Types.h>
|
||||
#include <AK/Utf16View.h>
|
||||
|
||||
namespace AK::Detail {
|
||||
|
||||
// Reference-counted, heap-allocated string data for UTF-16 strings.
//
// The character data is stored directly after the object, either as 8-bit ASCII or as 16-bit UTF-16 code units.
// Which of the two is in use is encoded in the most significant bit of m_length_in_code_units.
class Utf16StringData final : public RefCounted<Utf16StringData> {
public:
    enum class StorageType : u8 {
        ASCII,
        UTF16,
    };

    enum class AllowASCIIStorage : u8 {
        No,
        Yes,
    };

    static NonnullRefPtr<Utf16StringData> from_utf8(StringView, AllowASCIIStorage);
    static NonnullRefPtr<Utf16StringData> from_utf16(Utf16View const&);
    static NonnullRefPtr<Utf16StringData> from_utf32(Utf32View const&);

    ~Utf16StringData() = default;

    // Instances are allocated with malloc (see create_uninitialized), so they must be released with free.
    void operator delete(void* ptr)
    {
        free(ptr);
    }

    [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16StringData const& other) const
    {
        return utf16_view() == other.utf16_view();
    }

    [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16View const& other) const
    {
        return utf16_view() == other;
    }

    [[nodiscard]] ALWAYS_INLINE bool operator==(StringView const& other) const
    {
        if (has_ascii_storage())
            return ascii_view() == other;
        return utf16_view() == Utf16View { other.characters_without_null_termination(), other.length() };
    }

    [[nodiscard]] ALWAYS_INLINE bool has_ascii_storage() const { return m_length_in_code_units >> Detail::UTF16_FLAG == 0; }
    [[nodiscard]] ALWAYS_INLINE bool has_utf16_storage() const { return m_length_in_code_units >> Detail::UTF16_FLAG != 0; }

    // Returns the hash of the string, computing it on first use and caching it afterwards.
    ALWAYS_INLINE u32 hash() const
    {
        if (!m_has_hash) {
            m_hash = calculate_hash();
            // Record that the cache is populated; without this the hash would be recomputed on every call.
            m_has_hash = true;
        }
        return m_hash;
    }

    // The storage-type flag bit is masked off to get the actual length.
    [[nodiscard]] ALWAYS_INLINE size_t length_in_code_units() const { return m_length_in_code_units & ~(1uz << Detail::UTF16_FLAG); }

    [[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const
    {
        // In ASCII storage every code unit is a code point.
        if (has_ascii_storage())
            return length_in_code_units();

        // NumericLimits<size_t>::max() marks "not computed yet"; compute lazily and cache the result.
        if (m_length_in_code_points == NumericLimits<size_t>::max())
            m_length_in_code_points = calculate_code_point_length();
        return m_length_in_code_points;
    }

    [[nodiscard]] ALWAYS_INLINE StringView ascii_view() const
    {
        ASSERT(has_ascii_storage());
        return { m_ascii_data, length_in_code_units() };
    }

    [[nodiscard]] ALWAYS_INLINE Utf16View utf16_view() const
    {
        if (has_ascii_storage())
            return { m_ascii_data, length_in_code_units() };

        Utf16View view { m_utf16_data, length_in_code_units() };
        // Hand our code point length field (possibly still the "not computed" marker) to the view.
        view.m_length_in_code_points = m_length_in_code_points;

        return view;
    }

private:
    ALWAYS_INLINE Utf16StringData(StorageType storage_type, size_t code_unit_length)
        : m_length_in_code_units(code_unit_length)
    {
        if (storage_type == StorageType::UTF16)
            m_length_in_code_units |= 1uz << Detail::UTF16_FLAG;
    }

    static NonnullRefPtr<Utf16StringData> create_uninitialized(StorageType storage_type, size_t code_unit_length);

    template<typename ViewType>
    static NonnullRefPtr<Utf16StringData> create_from_code_point_iterable(ViewType const&);

    [[nodiscard]] size_t calculate_code_point_length() const;

    [[nodiscard]] ALWAYS_INLINE u32 calculate_hash() const
    {
        if (has_ascii_storage())
            return ascii_view().hash();
        return utf16_view().hash();
    }

    // We store whether this string has ASCII or UTF-16 storage by setting the most significant bit of m_length_in_code_units
    // to 1 for UTF-16 storage. This shrinks the size of most UTF-16 string related classes, at the cost of not being
    // allowed to create a string larger than 2**63 - 1.
    size_t m_length_in_code_units { 0 };
    mutable size_t m_length_in_code_points { NumericLimits<size_t>::max() };

    mutable u32 m_hash { 0 };
    mutable bool m_has_hash { false };

    // Character data follows the object; which member is valid depends on the storage type.
    union {
        char m_ascii_data[0];
        char16_t m_utf16_data[0];
    };
};
|
||||
|
||||
}
|
|
@ -99,15 +99,19 @@ ErrorOr<String> Utf16View::to_utf8(AllowLonelySurrogates allow_lonely_surrogates
|
|||
{
|
||||
if (is_empty())
|
||||
return String {};
|
||||
if (has_ascii_storage())
|
||||
return String::from_utf8_without_validation(bytes());
|
||||
|
||||
if (!validate(allow_lonely_surrogates))
|
||||
return Error::from_string_literal("Input was not valid UTF-16");
|
||||
|
||||
if (allow_lonely_surrogates == AllowLonelySurrogates::No) {
|
||||
String result;
|
||||
auto utf8_length = simdutf::utf8_length_from_utf16(m_string, length_in_code_units());
|
||||
|
||||
auto utf8_length = simdutf::utf8_length_from_utf16(m_string.utf16, length_in_code_units());
|
||||
|
||||
TRY(result.replace_with_new_string(Badge<Utf16View> {}, utf8_length, [&](Bytes buffer) -> ErrorOr<void> {
|
||||
[[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(m_string, length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
|
||||
[[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(m_string.utf16, length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
|
||||
ASSERT(result == buffer.size());
|
||||
return {};
|
||||
}));
|
||||
|
@ -127,17 +131,25 @@ ErrorOr<ByteString> Utf16View::to_byte_string(AllowLonelySurrogates allow_lonely
|
|||
|
||||
bool Utf16View::is_ascii() const
|
||||
{
|
||||
if (has_ascii_storage())
|
||||
return true;
|
||||
|
||||
// FIXME: Petition simdutf to implement an ASCII validator for UTF-16.
|
||||
return all_of(span(), AK::is_ascii);
|
||||
return all_of(utf16_span(), AK::is_ascii);
|
||||
}
|
||||
|
||||
bool Utf16View::validate(size_t& valid_code_units, AllowLonelySurrogates allow_lonely_surrogates) const
|
||||
{
|
||||
if (has_ascii_storage()) {
|
||||
valid_code_units = length_in_code_units();
|
||||
return true;
|
||||
}
|
||||
|
||||
auto view = *this;
|
||||
valid_code_units = 0;
|
||||
|
||||
while (!view.is_empty()) {
|
||||
auto result = simdutf::validate_utf16_with_errors(view.m_string, view.length_in_code_units());
|
||||
auto result = simdutf::validate_utf16_with_errors(view.m_string.utf16, view.length_in_code_units());
|
||||
valid_code_units += result.count;
|
||||
|
||||
if (result.error == simdutf::SUCCESS)
|
||||
|
@ -197,7 +209,9 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
|
|||
return substring_view(code_point_offset, code_point_length);
|
||||
|
||||
auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) {
|
||||
return it.m_iterator - m_string;
|
||||
if (has_ascii_storage())
|
||||
return it.m_iterator.ascii - m_string.ascii;
|
||||
return it.m_iterator.utf16 - m_string.utf16;
|
||||
};
|
||||
|
||||
size_t code_point_index = 0;
|
||||
|
@ -220,9 +234,11 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
|
|||
|
||||
size_t Utf16View::calculate_length_in_code_points() const
|
||||
{
|
||||
ASSERT(!has_ascii_storage());
|
||||
|
||||
// simdutf's code point length method assumes valid UTF-16, whereas we allow lonely surrogates.
|
||||
if (validate(AllowLonelySurrogates::No)) [[likely]]
|
||||
return simdutf::count_utf16(m_string, length_in_code_units());
|
||||
return simdutf::count_utf16(m_string.utf16, length_in_code_units());
|
||||
|
||||
size_t code_points = 0;
|
||||
for ([[maybe_unused]] auto code_point : *this)
|
||||
|
|
245
AK/Utf16View.h
245
AK/Utf16View.h
|
@ -37,6 +37,13 @@ ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const&);
|
|||
|
||||
size_t utf16_code_unit_length_from_utf8(StringView);
|
||||
|
||||
// Internal helpers shared by the UTF-16 string types.
namespace Detail {
|
||||
|
||||
// Bit index of the storage flag: the most significant bit of a size_t. Length fields in this file set
// this bit to mark UTF-16 storage and leave it clear for ASCII storage.
static constexpr inline auto UTF16_FLAG = NumericLimits<size_t>::digits() - 1;
|
||||
// Forward declaration; Utf16View declares this class a friend further down in this file.
class Utf16StringBase;
|
||||
|
||||
}
|
||||
|
||||
class Utf16CodePointIterator {
|
||||
friend class Utf16View;
|
||||
|
||||
|
@ -46,23 +53,35 @@ public:
|
|||
|
||||
constexpr Utf16CodePointIterator& operator++()
|
||||
{
|
||||
VERIFY(m_remaining_code_units > 0);
|
||||
auto remaining_code_units = this->remaining_code_units();
|
||||
VERIFY(remaining_code_units > 0);
|
||||
|
||||
auto length = min(length_in_code_units(), m_remaining_code_units);
|
||||
m_iterator += length;
|
||||
if (has_ascii_storage()) {
|
||||
++m_iterator.ascii;
|
||||
--m_remaining_code_units;
|
||||
} else {
|
||||
auto length = min(length_in_code_units(), remaining_code_units);
|
||||
|
||||
m_iterator.utf16 += length;
|
||||
m_remaining_code_units -= length;
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
constexpr u32 operator*() const
|
||||
{
|
||||
VERIFY(m_remaining_code_units > 0);
|
||||
auto code_unit = *m_iterator;
|
||||
auto remaining_code_units = this->remaining_code_units();
|
||||
VERIFY(remaining_code_units > 0);
|
||||
|
||||
if (has_ascii_storage())
|
||||
return *m_iterator.ascii;
|
||||
|
||||
auto code_unit = *m_iterator.utf16;
|
||||
|
||||
if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) {
|
||||
if (m_remaining_code_units > 1) {
|
||||
auto next_code_unit = *(m_iterator + 1);
|
||||
if (remaining_code_units > 1) {
|
||||
auto next_code_unit = *(m_iterator.utf16 + 1);
|
||||
|
||||
if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit))
|
||||
return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
|
||||
|
@ -79,22 +98,46 @@ public:
|
|||
|
||||
[[nodiscard]] constexpr bool operator==(Utf16CodePointIterator const& other) const
|
||||
{
|
||||
return (m_iterator == other.m_iterator) && (m_remaining_code_units == other.m_remaining_code_units);
|
||||
// Note that this also protects against iterators with different underlying storage.
|
||||
if (m_remaining_code_units != other.m_remaining_code_units)
|
||||
return false;
|
||||
|
||||
if (has_ascii_storage())
|
||||
return m_iterator.ascii == other.m_iterator.ascii;
|
||||
return m_iterator.utf16 == other.m_iterator.utf16;
|
||||
}
|
||||
|
||||
[[nodiscard]] constexpr size_t length_in_code_units() const
|
||||
[[nodiscard]] ALWAYS_INLINE size_t length_in_code_units()
|
||||
{
|
||||
if (has_ascii_storage())
|
||||
return 1;
|
||||
return UnicodeUtils::code_unit_length_for_code_point(**this);
|
||||
}
|
||||
|
||||
private:
|
||||
constexpr Utf16CodePointIterator(char16_t const* ptr, size_t length)
|
||||
: m_iterator(ptr)
|
||||
constexpr Utf16CodePointIterator(char const* iterator, size_t length)
|
||||
: m_iterator { .ascii = iterator }
|
||||
, m_remaining_code_units(length)
|
||||
{
|
||||
}
|
||||
|
||||
char16_t const* m_iterator { nullptr };
|
||||
constexpr Utf16CodePointIterator(char16_t const* iterator, size_t length)
|
||||
: m_iterator { .utf16 = iterator }
|
||||
, m_remaining_code_units(length)
|
||||
{
|
||||
m_remaining_code_units |= 1uz << Detail::UTF16_FLAG;
|
||||
}
|
||||
|
||||
// True when the UTF-16 flag bit (the top bit of m_remaining_code_units) is clear.
constexpr bool has_ascii_storage() const
{
    return (m_remaining_code_units & (1uz << Detail::UTF16_FLAG)) == 0;
}
|
||||
// Number of code units left to iterate, with the storage flag bit masked off.
constexpr size_t remaining_code_units() const
{
    auto const storage_flag = 1uz << Detail::UTF16_FLAG;
    return m_remaining_code_units & ~storage_flag;
}
|
||||
|
||||
union {
|
||||
char const* ascii;
|
||||
char16_t const* utf16;
|
||||
} m_iterator { .ascii = nullptr };
|
||||
|
||||
// Just like Utf16StringData, we store whether this string has ASCII or UTF-16 storage by setting the most
|
||||
// significant bit of m_remaining_code_units for UTF-16 storage.
|
||||
size_t m_remaining_code_units { 0 };
|
||||
};
|
||||
|
||||
|
@ -106,38 +149,86 @@ public:
|
|||
~Utf16View() = default;
|
||||
|
||||
constexpr Utf16View(char16_t const* string, size_t length_in_code_units)
|
||||
: m_string(string)
|
||||
: m_string { .utf16 = string }
|
||||
, m_length_in_code_units(length_in_code_units)
|
||||
{
|
||||
m_length_in_code_units |= 1uz << Detail::UTF16_FLAG;
|
||||
}
|
||||
|
||||
constexpr Utf16View(Utf16Data const& string)
|
||||
: m_string(string.data())
|
||||
: m_string { .utf16 = string.data() }
|
||||
, m_length_in_code_units(string.size())
|
||||
{
|
||||
m_length_in_code_units |= 1uz << Detail::UTF16_FLAG;
|
||||
}
|
||||
|
||||
// Compile-time-only (consteval) constructor that views `string` as ASCII storage. The UTF-16 flag bit
// is left clear, and evaluation fails unless every character satisfies AK::is_ascii.
consteval Utf16View(StringView string)
|
||||
: m_string { .ascii = string.characters_without_null_termination() }
|
||||
, m_length_in_code_units(string.length())
|
||||
{
|
||||
VERIFY(all_of(string, AK::is_ascii));
|
||||
}
|
||||
|
||||
Utf16View(Utf16ConversionResult&&) = delete;
|
||||
explicit Utf16View(Utf16ConversionResult const& conversion_result)
|
||||
: m_string(conversion_result.data.data())
|
||||
: m_string { .utf16 = conversion_result.data.data() }
|
||||
, m_length_in_code_units(conversion_result.data.size())
|
||||
, m_length_in_code_points(conversion_result.code_point_count)
|
||||
{
|
||||
m_length_in_code_units |= 1uz << Detail::UTF16_FLAG;
|
||||
}
|
||||
|
||||
ErrorOr<String> to_utf8(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
|
||||
ErrorOr<ByteString> to_byte_string(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
|
||||
|
||||
[[nodiscard]] constexpr ReadonlySpan<char16_t> span() const
|
||||
ALWAYS_INLINE String to_utf8_but_should_be_ported_to_utf16(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const
|
||||
{
|
||||
return { m_string, length_in_code_units() };
|
||||
return MUST(to_utf8(allow_lonely_surrogates));
|
||||
}
|
||||
|
||||
// True when the UTF-16 flag bit (the top bit of m_length_in_code_units) is clear, i.e. the view
// points at ASCII storage.
[[nodiscard]] ALWAYS_INLINE bool has_ascii_storage() const
{
    return (m_length_in_code_units & (1uz << Detail::UTF16_FLAG)) == 0;
}
|
||||
|
||||
// Returns the ASCII storage as raw bytes, one byte per code unit.
// Only valid for ASCII storage; VERIFYs otherwise.
[[nodiscard]] constexpr ReadonlyBytes bytes() const
|
||||
{
|
||||
VERIFY(has_ascii_storage());
|
||||
return { m_string.ascii, length_in_code_units() };
|
||||
}
|
||||
|
||||
// Returns the ASCII storage as a span of chars.
// Only valid for ASCII storage; VERIFYs otherwise.
[[nodiscard]] constexpr ReadonlySpan<char> ascii_span() const
|
||||
{
|
||||
VERIFY(has_ascii_storage());
|
||||
return { m_string.ascii, length_in_code_units() };
|
||||
}
|
||||
|
||||
// Returns the UTF-16 storage as a span of code units.
// Only valid for UTF-16 storage; VERIFYs when the view has ASCII storage.
[[nodiscard]] constexpr ReadonlySpan<char16_t> utf16_span() const
|
||||
{
|
||||
VERIFY(!has_ascii_storage());
|
||||
return { m_string.utf16, length_in_code_units() };
|
||||
}
|
||||
|
||||
[[nodiscard]] constexpr bool operator==(Utf16View const& other) const
|
||||
{
|
||||
if (length_in_code_units() != other.length_in_code_units())
|
||||
return false;
|
||||
return TypedTransfer<char16_t>::compare(m_string, other.m_string, length_in_code_units());
|
||||
|
||||
if (has_ascii_storage() && other.has_ascii_storage())
|
||||
return TypedTransfer<char>::compare(m_string.ascii, other.m_string.ascii, length_in_code_units());
|
||||
if (!has_ascii_storage() && !other.has_ascii_storage())
|
||||
return TypedTransfer<char16_t>::compare(m_string.utf16, other.m_string.utf16, length_in_code_units());
|
||||
|
||||
for (size_t i = 0; i < length_in_code_units(); ++i) {
|
||||
if (code_unit_at(i) != other.code_unit_at(i))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Compares this view against a byte string. ASCII storage is compared byte-for-byte; UTF-16 storage is
// compared code unit by code unit against `other` wrapped as an ASCII-storage view.
[[nodiscard]] constexpr bool operator==(StringView other) const
{
    if (!has_ascii_storage()) {
        // Defer to the Utf16View overload, which handles mixed storage.
        return *this == Utf16View { other.characters_without_null_termination(), other.length() };
    }

    return bytes() == other.bytes();
}
|
||||
|
||||
[[nodiscard]] constexpr bool equals_ignoring_case(Utf16View const& other) const
|
||||
|
@ -175,10 +266,18 @@ public:
|
|||
{
|
||||
if (is_empty())
|
||||
return 0;
|
||||
return string_hash(reinterpret_cast<char const*>(m_string), length_in_code_units() * sizeof(char16_t));
|
||||
if (has_ascii_storage())
|
||||
return string_hash(m_string.ascii, length_in_code_units());
|
||||
return string_hash(reinterpret_cast<char const*>(m_string.utf16), length_in_code_units() * sizeof(char16_t));
|
||||
}
|
||||
|
||||
[[nodiscard]] constexpr bool is_null() const
|
||||
{
|
||||
if (has_ascii_storage())
|
||||
return m_string.ascii == nullptr;
|
||||
return m_string.utf16 == nullptr;
|
||||
}
|
||||
|
||||
[[nodiscard]] constexpr bool is_null() const { return m_string == nullptr; }
|
||||
[[nodiscard]] constexpr bool is_empty() const { return length_in_code_units() == 0; }
|
||||
[[nodiscard]] bool is_ascii() const;
|
||||
|
||||
|
@ -190,10 +289,13 @@ public:
|
|||
|
||||
[[nodiscard]] bool validate(size_t& valid_code_units, AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
|
||||
|
||||
[[nodiscard]] constexpr size_t length_in_code_units() const { return m_length_in_code_units; }
|
||||
[[nodiscard]] constexpr size_t length_in_code_units() const { return m_length_in_code_units & ~(1uz << Detail::UTF16_FLAG); }
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const
|
||||
{
|
||||
if (has_ascii_storage())
|
||||
return m_length_in_code_units;
|
||||
|
||||
if (m_length_in_code_points == NumericLimits<size_t>::max())
|
||||
m_length_in_code_points = calculate_length_in_code_points();
|
||||
return m_length_in_code_points;
|
||||
|
@ -201,6 +303,9 @@ public:
|
|||
|
||||
constexpr Optional<size_t> length_in_code_points_if_known() const
|
||||
{
|
||||
if (has_ascii_storage())
|
||||
return m_length_in_code_units;
|
||||
|
||||
if (m_length_in_code_points == NumericLimits<size_t>::max())
|
||||
return {};
|
||||
return m_length_in_code_points;
|
||||
|
@ -211,7 +316,10 @@ public:
|
|||
[[nodiscard]] constexpr char16_t code_unit_at(size_t index) const
|
||||
{
|
||||
VERIFY(index < length_in_code_units());
|
||||
return m_string[index];
|
||||
|
||||
if (has_ascii_storage())
|
||||
return m_string.ascii[index];
|
||||
return m_string.utf16[index];
|
||||
}
|
||||
|
||||
[[nodiscard]] constexpr u32 code_point_at(size_t index) const
|
||||
|
@ -236,18 +344,25 @@ public:
|
|||
|
||||
[[nodiscard]] constexpr Utf16CodePointIterator begin() const
|
||||
{
|
||||
return { m_string, length_in_code_units() };
|
||||
if (has_ascii_storage())
|
||||
return { m_string.ascii, length_in_code_units() };
|
||||
return { m_string.utf16, length_in_code_units() };
|
||||
}
|
||||
|
||||
[[nodiscard]] constexpr Utf16CodePointIterator end() const
|
||||
{
|
||||
return { m_string + length_in_code_units(), 0 };
|
||||
if (has_ascii_storage())
|
||||
return { m_string.ascii + length_in_code_units(), 0 };
|
||||
return { m_string.utf16 + length_in_code_units(), 0 };
|
||||
}
|
||||
|
||||
[[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const
|
||||
{
|
||||
VERIFY(code_unit_offset + code_unit_length <= length_in_code_units());
|
||||
return { m_string + code_unit_offset, code_unit_length };
|
||||
|
||||
if (has_ascii_storage())
|
||||
return { m_string.ascii + code_unit_offset, code_unit_length };
|
||||
return { m_string.utf16 + code_unit_offset, code_unit_length };
|
||||
}
|
||||
|
||||
[[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset) const { return substring_view(code_unit_offset, length_in_code_units() - code_unit_offset); }
|
||||
|
@ -259,12 +374,42 @@ public:
|
|||
{
|
||||
if (start_offset >= length_in_code_units())
|
||||
return {};
|
||||
return AK::memmem_optional(m_string + start_offset, (length_in_code_units() - start_offset) * sizeof(char16_t), &needle, sizeof(needle));
|
||||
|
||||
if (has_ascii_storage()) {
    // A non-ASCII code unit can never occur in ASCII storage, so report "not found" with an empty
    // result. The previous `return false` converted implicitly to an engaged offset of 0, i.e. it
    // claimed the needle was found at the start of the string.
    if (!AK::is_ascii(needle))
        return {};

    // Search a single byte, since ASCII storage holds one byte per code unit.
    auto byte = static_cast<char>(needle);
    // NOTE(review): the search starts at m_string.ascii + start_offset; presumably the returned offset
    // is relative to that pointer rather than to the start of the view. Confirm against memmem_optional.
    return AK::memmem_optional(m_string.ascii + start_offset, length_in_code_units() - start_offset, &byte, sizeof(byte));
}
|
||||
|
||||
return AK::memmem_optional(m_string.utf16 + start_offset, (length_in_code_units() - start_offset) * sizeof(char16_t), &needle, sizeof(needle));
|
||||
}
|
||||
|
||||
constexpr Optional<size_t> find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const
|
||||
{
|
||||
return span().index_of(needle.span(), start_offset);
|
||||
if (has_ascii_storage() && needle.has_ascii_storage())
|
||||
return ascii_span().index_of(needle.ascii_span(), start_offset);
|
||||
if (!has_ascii_storage() && !needle.has_ascii_storage())
|
||||
return utf16_span().index_of(needle.utf16_span(), start_offset);
|
||||
|
||||
Checked maximum_offset { start_offset };
|
||||
maximum_offset += needle.length_in_code_units();
|
||||
if (maximum_offset.has_overflow() || maximum_offset.value() > length_in_code_units())
|
||||
return {};
|
||||
|
||||
if (needle.is_empty())
|
||||
return start_offset;
|
||||
|
||||
for (size_t index = start_offset; index <= length_in_code_units() - needle.length_in_code_units();) {
|
||||
auto slice = substring_view(index, needle.length_in_code_units());
|
||||
if (slice == needle)
|
||||
return index;
|
||||
|
||||
index += slice.begin().length_in_code_units();
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
constexpr Optional<size_t> find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset = 0) const
|
||||
|
@ -298,9 +443,24 @@ public:
|
|||
if (needle.length_in_code_units() > length_in_code_units())
|
||||
return false;
|
||||
|
||||
if (m_string == needle.m_string)
|
||||
if (has_ascii_storage() && needle.has_ascii_storage()) {
|
||||
if (m_string.ascii == needle.m_string.ascii)
|
||||
return true;
|
||||
return ascii_span().starts_with(needle.ascii_span());
|
||||
}
|
||||
|
||||
if (!has_ascii_storage() && !needle.has_ascii_storage()) {
|
||||
if (m_string.utf16 == needle.m_string.utf16)
|
||||
return true;
|
||||
return utf16_span().starts_with(needle.utf16_span());
|
||||
}
|
||||
|
||||
for (auto this_it = begin(), needle_it = needle.begin(); needle_it != needle.end(); ++needle_it, ++this_it) {
|
||||
if (*this_it != *needle_it)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
return span().starts_with(needle.span());
|
||||
}
|
||||
|
||||
// https://infra.spec.whatwg.org/#code-unit-less-than
|
||||
|
@ -320,9 +480,24 @@ public:
|
|||
}
|
||||
|
||||
private:
|
||||
friend Detail::Utf16StringBase;
|
||||
friend Detail::Utf16StringData;
|
||||
|
||||
// Private constructor for ASCII storage: the UTF-16 flag bit stays clear. Unlike the public consteval
// StringView constructor, nothing here checks that the characters are ASCII, so callers must ensure it.
constexpr Utf16View(char const* string, size_t length_in_code_units)
|
||||
: m_string { .ascii = string }
|
||||
, m_length_in_code_units(length_in_code_units)
|
||||
{
|
||||
}
|
||||
|
||||
[[nodiscard]] size_t calculate_length_in_code_points() const;
|
||||
|
||||
char16_t const* m_string { nullptr };
|
||||
union {
|
||||
char const* ascii;
|
||||
char16_t const* utf16;
|
||||
} m_string { .ascii = nullptr };
|
||||
|
||||
// Just like Utf16StringData, we store whether this string has ASCII or UTF-16 storage by setting the most
|
||||
// significant bit of m_length_in_code_units for UTF-16 storage.
|
||||
size_t m_length_in_code_units { 0 };
|
||||
mutable size_t m_length_in_code_points { NumericLimits<size_t>::max() };
|
||||
};
|
||||
|
@ -342,6 +517,16 @@ struct Traits<Utf16View> : public DefaultTraits<Utf16View> {
|
|||
static unsigned hash(Utf16View const& s) { return s.hash(); }
|
||||
};
|
||||
|
||||
// Marks Utf16View and Utf16String as hash-compatible in both directions (presumably so a hashed
// container keyed by one type can be probed with the other; confirm against IsHashCompatible's definition).
// NOTE(review): Utf16View::hash() hashes ASCII storage as one byte per code unit and UTF-16 storage as
// two bytes per code unit, while operator== treats views with different storage as equal when their
// code units match. Verify that lookups using a UTF-16-storage view of ASCII text still find entries.
namespace Detail {
|
||||
|
||||
template<>
|
||||
inline constexpr bool IsHashCompatible<Utf16View, Utf16String> = true;
|
||||
|
||||
template<>
|
||||
inline constexpr bool IsHashCompatible<Utf16String, Utf16View> = true;
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
[[nodiscard]] ALWAYS_INLINE AK_STRING_VIEW_LITERAL_CONSTEVAL AK::Utf16View operator""sv(char16_t const* string, size_t length)
|
||||
|
|
|
@ -6,7 +6,9 @@
|
|||
|
||||
#pragma once
|
||||
|
||||
#include <AK/AllOf.h>
|
||||
#include <AK/Assertions.h>
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/Checked.h>
|
||||
#include <AK/Format.h>
|
||||
#include <AK/Types.h>
|
||||
|
@ -99,6 +101,12 @@ public:
|
|||
bool is_null() const { return !m_code_points; }
|
||||
size_t length() const { return m_length; }
|
||||
|
||||
bool is_ascii() const
|
||||
{
|
||||
// FIXME: Petition simdutf to implement an ASCII validator for UTF-32.
|
||||
return all_of(*this, AK::is_ascii);
|
||||
}
|
||||
|
||||
size_t iterator_offset(Utf32CodePointIterator const& it) const
|
||||
{
|
||||
VERIFY(it.m_ptr >= m_code_points);
|
||||
|
|
|
@ -1451,7 +1451,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
|
|||
auto ref_length = ref.length_in_code_units();
|
||||
|
||||
// k. Set result to the string-concatenation of result and refReplacement.
|
||||
result.append(ref_replacement.span().data(), ref_replacement.length_in_code_units());
|
||||
result.append(ref_replacement.utf16_span().data(), ref_replacement.length_in_code_units());
|
||||
|
||||
// j. Set templateRemainder to the substring of templateRemainder from refLength.
|
||||
// NOTE: We do this step last because refReplacement may point to templateRemainder.
|
||||
|
|
|
@ -44,7 +44,13 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
|
|||
{
|
||||
Utf16Data string;
|
||||
string.ensure_capacity(view.length_in_code_units());
|
||||
string.unchecked_append(view.span().data(), view.length_in_code_units());
|
||||
|
||||
if (view.has_ascii_storage()) {
|
||||
for (size_t i = 0; i < view.length_in_code_units(); ++i)
|
||||
string.unchecked_append(static_cast<char16_t>(view.code_unit_at(i)));
|
||||
} else {
|
||||
string.unchecked_append(view.utf16_span().data(), view.length_in_code_units());
|
||||
}
|
||||
|
||||
auto impl = create(move(string));
|
||||
if (auto length_in_code_points = view.length_in_code_points_if_known(); length_in_code_points.has_value())
|
||||
|
|
|
@ -75,7 +75,12 @@ public:
|
|||
|
||||
virtual void set_segmented_text(Utf16View const& text) override
|
||||
{
|
||||
m_segmented_text = icu::UnicodeString { text.span().data(), static_cast<i32>(text.length_in_code_units()) };
|
||||
if (text.has_ascii_storage()) {
|
||||
set_segmented_text(MUST(text.to_utf8()));
|
||||
return;
|
||||
}
|
||||
|
||||
m_segmented_text = icu::UnicodeString { text.utf16_span().data(), static_cast<i32>(text.length_in_code_units()) };
|
||||
m_segmenter->setText(m_segmented_text.get<icu::UnicodeString>());
|
||||
}
|
||||
|
||||
|
|
|
@ -89,9 +89,9 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
|
|||
|
||||
Utf16Data full_data;
|
||||
full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
|
||||
full_data.append(before_data.span().data(), before_data.length_in_code_units());
|
||||
full_data.append(before_data.utf16_span().data(), before_data.length_in_code_units());
|
||||
full_data.extend(inserted_data_result.data);
|
||||
full_data.append(after_data.span().data(), after_data.length_in_code_units());
|
||||
full_data.append(after_data.utf16_span().data(), after_data.length_in_code_units());
|
||||
Utf16View full_view { full_data };
|
||||
|
||||
bool characters_are_the_same = utf16_view == full_view;
|
||||
|
|
|
@ -76,6 +76,7 @@ set(AK_TEST_SOURCES
|
|||
TestTypeTraits.cpp
|
||||
TestTypedTransfer.cpp
|
||||
TestUFixedBigInt.cpp
|
||||
TestUtf16String.cpp
|
||||
TestUtf16View.cpp
|
||||
TestUtf8View.cpp
|
||||
TestVariant.cpp
|
||||
|
|
516
Tests/AK/TestUtf16String.cpp
Normal file
516
Tests/AK/TestUtf16String.cpp
Normal file
|
@ -0,0 +1,516 @@
|
|||
/*
|
||||
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <LibTest/TestCase.h>
|
||||
|
||||
#include <AK/Array.h>
|
||||
#include <AK/CharacterTypes.h>
|
||||
#include <AK/Enumerate.h>
|
||||
#include <AK/StringBuilder.h>
|
||||
#include <AK/Utf16String.h>
|
||||
#include <AK/Utf32View.h>
|
||||
|
||||
// Re-creates `string` through the factory that matches its current storage: from_utf8 for ASCII
// storage, from_utf16 otherwise.
static Utf16String make_copy(Utf16String const& string)
{
    if (string.has_ascii_storage())
        return Utf16String::from_utf8(string.ascii_view());

    return Utf16String::from_utf16(string.utf16_view());
}
|
||||
|
||||
// A default-constructed Utf16String is empty, counts as ASCII, and uses short ASCII storage.
TEST_CASE(empty_string)
|
||||
{
|
||||
Utf16String string {};
|
||||
EXPECT(string.is_empty());
|
||||
EXPECT(string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 0uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 0uz);
|
||||
EXPECT_EQ(string.ascii_view(), StringView {});
|
||||
}
|
||||
|
||||
// Verifies the storage kind, lengths and contents of strings created from UTF-8 input.
TEST_CASE(from_utf8)
|
||||
{
|
||||
// Short ASCII input: expected to use short ASCII storage.
{
|
||||
auto string = Utf16String::from_utf8("hello!"sv);
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 6uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 6uz);
|
||||
EXPECT_EQ(string.ascii_view(), "hello!"sv);
|
||||
}
|
||||
// Longer ASCII input: expected to use long ASCII storage.
{
|
||||
auto string = Utf16String::from_utf8("hello there!"sv);
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(string.is_ascii());
|
||||
EXPECT(string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 12uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 12uz);
|
||||
EXPECT_EQ(string.ascii_view(), "hello there!"sv);
|
||||
}
|
||||
// Short non-ASCII input: neither ASCII storage kind; one code point spanning two code units.
{
|
||||
auto string = Utf16String::from_utf8("😀"sv);
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(!string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 2uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 1uz);
|
||||
EXPECT_EQ(string.utf16_view(), u"😀"sv);
|
||||
}
|
||||
// Mixed ASCII and non-ASCII input: the emoji takes two code units but counts as one code point.
{
|
||||
auto string = Utf16String::from_utf8("hello 😀 there!"sv);
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(!string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 15uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 14uz);
|
||||
EXPECT_EQ(string.utf16_view(), u"hello 😀 there!"sv);
|
||||
}
|
||||
// Bytes ED A0 80 are expected to come out as the lone code unit 0xD800.
{
|
||||
auto string = Utf16String::from_utf8("hello \xed\xa0\x80!"sv);
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(!string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 8uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 8uz);
|
||||
EXPECT_EQ(string.utf16_view(), u"hello \xd800!"sv);
|
||||
}
|
||||
// Bytes ED B0 80 are expected to come out as the lone code unit 0xDC00.
{
|
||||
auto string = Utf16String::from_utf8("hello \xed\xb0\x80!"sv);
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(!string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 8uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 8uz);
|
||||
EXPECT_EQ(string.utf16_view(), u"hello \xdc00!"sv);
|
||||
}
|
||||
}
|
||||
|
||||
// Verifies the storage kind, lengths and contents of strings created from UTF-16 input.
TEST_CASE(from_utf16)
|
||||
{
|
||||
// Short ASCII-only UTF-16 input: expected to end up in short ASCII storage.
{
|
||||
auto string = Utf16String::from_utf16(u"hello!"sv);
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 6uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 6uz);
|
||||
EXPECT_EQ(string.ascii_view(), "hello!"sv);
|
||||
}
|
||||
// Longer ASCII-only UTF-16 input: expected to end up in long ASCII storage.
{
|
||||
auto string = Utf16String::from_utf16(u"hello there!"sv);
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(string.is_ascii());
|
||||
EXPECT(string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 12uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 12uz);
|
||||
EXPECT_EQ(string.ascii_view(), "hello there!"sv);
|
||||
}
|
||||
// Short non-ASCII input: neither ASCII storage kind; one code point spanning two code units.
{
|
||||
auto string = Utf16String::from_utf16(u"😀"sv);
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(!string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 2uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 1uz);
|
||||
EXPECT_EQ(string.utf16_view(), u"😀"sv);
|
||||
}
|
||||
// Mixed ASCII and non-ASCII input: the emoji takes two code units but counts as one code point.
{
|
||||
auto string = Utf16String::from_utf16(u"hello 😀 there!"sv);
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(!string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 15uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 14uz);
|
||||
EXPECT_EQ(string.utf16_view(), u"hello 😀 there!"sv);
|
||||
}
|
||||
// Lone code unit 0xD800 is expected to be preserved and counted as one code point.
{
|
||||
auto string = Utf16String::from_utf16(u"hello \xd800!"sv);
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(!string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 8uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 8uz);
|
||||
EXPECT_EQ(string.utf16_view(), u"hello \xd800!"sv);
|
||||
}
|
||||
// Lone code unit 0xDC00 is expected to be preserved and counted as one code point.
{
|
||||
auto string = Utf16String::from_utf16(u"hello \xdc00!"sv);
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(!string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 8uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 8uz);
|
||||
EXPECT_EQ(string.utf16_view(), u"hello \xdc00!"sv);
|
||||
}
|
||||
}
|
||||
|
||||
// Verifies the storage kind, lengths and contents of strings created from UTF-32 input.
TEST_CASE(from_utf32)
|
||||
{
|
||||
// Counts the char32_t elements before the terminating zero.
auto strlen32 = [](char32_t const* string) {
|
||||
auto const* start = string;
|
||||
while (*start)
|
||||
++start;
|
||||
return static_cast<size_t>(start - string);
|
||||
};
|
||||
|
||||
// Wraps a zero-terminated char32_t literal in a Utf32View without copying it.
auto to_utf32_view = [&](char32_t const* string) {
|
||||
return Utf32View { reinterpret_cast<u32 const*>(string), strlen32(string) };
|
||||
};
|
||||
|
||||
// Short ASCII-only input: expected to end up in short ASCII storage.
{
|
||||
auto string = Utf16String::from_utf32(to_utf32_view(U"hello!"));
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 6uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 6uz);
|
||||
EXPECT_EQ(string.ascii_view(), "hello!"sv);
|
||||
}
|
||||
// Longer ASCII-only input: expected to end up in long ASCII storage.
{
|
||||
auto string = Utf16String::from_utf32(to_utf32_view(U"hello there!"));
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(string.is_ascii());
|
||||
EXPECT(string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 12uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 12uz);
|
||||
EXPECT_EQ(string.ascii_view(), "hello there!"sv);
|
||||
}
|
||||
// Single non-ASCII code point: expected to become two UTF-16 code units.
{
|
||||
auto string = Utf16String::from_utf32(to_utf32_view(U"😀"));
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(!string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 2uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 1uz);
|
||||
EXPECT_EQ(string.utf16_view(), u"😀"sv);
|
||||
}
|
||||
// Mixed ASCII and non-ASCII input: the emoji takes two code units but counts as one code point.
{
|
||||
auto string = Utf16String::from_utf32(to_utf32_view(U"hello 😀 there!"));
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(!string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 15uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 14uz);
|
||||
EXPECT_EQ(string.utf16_view(), u"hello 😀 there!"sv);
|
||||
}
|
||||
// Code point 0xD800 is expected to be stored as the lone code unit 0xD800.
{
|
||||
auto string = Utf16String::from_utf32(to_utf32_view(U"hello \xd800!"));
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(!string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 8uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 8uz);
|
||||
EXPECT_EQ(string.utf16_view(), u"hello \xd800!"sv);
|
||||
}
|
||||
// Code point 0xDC00 is expected to be stored as the lone code unit 0xDC00.
{
|
||||
auto string = Utf16String::from_utf32(to_utf32_view(U"hello \xdc00!"));
|
||||
EXPECT(!string.is_empty());
|
||||
EXPECT(!string.is_ascii());
|
||||
EXPECT(!string.has_long_ascii_storage());
|
||||
EXPECT(!string.has_short_ascii_storage());
|
||||
EXPECT_EQ(string.length_in_code_units(), 8uz);
|
||||
EXPECT_EQ(string.length_in_code_points(), 8uz);
|
||||
EXPECT_EQ(string.utf16_view(), u"hello \xdc00!"sv);
|
||||
}
|
||||
}
|
||||
|
||||
// Copy construction and copy assignment must leave the source unchanged and produce an equal string.
// Exercised with an empty string, two ASCII strings of different lengths, and a non-ASCII string.
TEST_CASE(copy_operations)
|
||||
{
|
||||
auto test = [](Utf16String const& string1) {
|
||||
// Independent reference value, used to check that string1 is not modified by copying.
auto original = make_copy(string1);
|
||||
|
||||
// Copy constructor.
|
||||
Utf16String string2(string1);
|
||||
|
||||
EXPECT_EQ(string1, original);
|
||||
EXPECT_EQ(string1, string2);
|
||||
|
||||
// Copy assignment.
|
||||
Utf16String string3;
|
||||
string3 = string1;
|
||||
|
||||
EXPECT_EQ(string1, original);
|
||||
EXPECT_EQ(string1, string3);
|
||||
};
|
||||
|
||||
test({});
|
||||
test("hello"_utf16);
|
||||
test("hello there general!"_utf16);
|
||||
test("hello 😀 there!"_utf16);
|
||||
}
|
||||
|
||||
// Move construction and move assignment must transfer the value and leave the moved-from string empty.
// Exercised with an empty string, two ASCII strings of different lengths, and a non-ASCII string.
TEST_CASE(move_operations)
|
||||
{
|
||||
auto test = [](Utf16String string1) {
|
||||
// Independent reference value, captured before string1 is moved from.
auto original = make_copy(string1);
|
||||
|
||||
// Move constructor.
|
||||
Utf16String string2(move(string1));
|
||||
|
||||
EXPECT(string1.is_empty());
|
||||
EXPECT_EQ(string1, Utf16String {});
|
||||
EXPECT_EQ(string2, original);
|
||||
|
||||
// Move assignment.
|
||||
Utf16String string3;
|
||||
string3 = move(string2);
|
||||
|
||||
EXPECT(string2.is_empty());
|
||||
EXPECT_EQ(string2, Utf16String {});
|
||||
EXPECT_EQ(string3, original);
|
||||
};
|
||||
|
||||
test({});
|
||||
test("hello"_utf16);
|
||||
test("hello there general!"_utf16);
|
||||
test("hello 😀 there!"_utf16);
|
||||
}
|
||||
|
||||
// Exercises operator== / operator!= between Utf16String instances, and between a Utf16String
// and a Utf16View, in both operand orders.
TEST_CASE(equals)
{
    auto verify = [](Utf16String const& subject, Utf16String const& different) {
        auto duplicate = make_copy(subject);

        // The view-based comparisons below only run when the subject reports long UTF-16 storage.
        auto const compare_views = subject.has_long_utf16_storage();

        // String vs. string, equal.
        EXPECT_EQ(subject, subject);
        EXPECT_EQ(subject, duplicate);
        EXPECT_EQ(duplicate, subject);
        EXPECT_EQ(duplicate, duplicate);

        if (compare_views) {
            // String vs. view, equal.
            EXPECT_EQ(subject, subject.utf16_view());
            EXPECT_EQ(subject, duplicate.utf16_view());
            EXPECT_EQ(duplicate, subject.utf16_view());
            EXPECT_EQ(duplicate, duplicate.utf16_view());

            // View vs. string, equal.
            EXPECT_EQ(subject.utf16_view(), subject);
            EXPECT_EQ(subject.utf16_view(), duplicate);
            EXPECT_EQ(duplicate.utf16_view(), subject);
            EXPECT_EQ(duplicate.utf16_view(), duplicate);
        }

        // String vs. string, not equal.
        EXPECT_NE(subject, different);
        EXPECT_NE(duplicate, different);
        EXPECT_NE(different, subject);
        EXPECT_NE(different, duplicate);

        if (compare_views) {
            // String vs. view, not equal.
            EXPECT_NE(subject, different.utf16_view());
            EXPECT_NE(duplicate, different.utf16_view());
            EXPECT_NE(different, subject.utf16_view());
            EXPECT_NE(different, duplicate.utf16_view());

            // View vs. string, not equal.
            EXPECT_NE(subject.utf16_view(), different);
            EXPECT_NE(duplicate.utf16_view(), different);
            EXPECT_NE(different.utf16_view(), subject);
            EXPECT_NE(different.utf16_view(), duplicate);
        }
    };

    // Empty (short ASCII) string against a short ASCII string.
    verify(Utf16String {}, "hello"_utf16);

    // Two short ASCII strings.
    verify("hello"_utf16, "there"_utf16);

    // Short ASCII string against a long ASCII string.
    verify("hello"_utf16, "hello there general!"_utf16);

    // Two long ASCII strings.
    verify("hello there!"_utf16, "hello there general!"_utf16);

    // Two UTF-16 strings.
    verify("😀"_utf16, "hello 😀"_utf16);

    // Short ASCII string against a UTF-16 string.
    verify("hello"_utf16, "😀"_utf16);

    // Short ASCII string against a UTF-16 string with the same code unit length.
    verify("ab"_utf16, "😀"_utf16);

    // Long ASCII string against a UTF-16 string.
    verify("hello there general!"_utf16, "😀"_utf16);

    // Long ASCII string against a UTF-16 string with the same code unit length.
    verify("ababababab"_utf16, "😀😀😀😀😀"_utf16);
}
|
||||
|
||||
// Exercises operator== / operator!= between a StringView and a Utf16String, in both operand orders.
TEST_CASE(equals_ascii)
{
    auto verify = [](StringView ascii, Utf16String const& different) {
        // Build the UTF-16 counterpart of the ASCII input.
        auto converted = Utf16String::from_utf8(ascii);

        EXPECT_EQ(ascii, converted);
        EXPECT_EQ(converted, ascii);

        EXPECT_NE(ascii, different);
        EXPECT_NE(different, ascii);
    };

    // Empty (short ASCII) string against a short ASCII string.
    verify({}, "hello"_utf16);

    // Two short ASCII strings.
    verify("hello"sv, "there"_utf16);

    // Short ASCII string against a long ASCII string.
    verify("hello"sv, "hello there general!"_utf16);

    // Two long ASCII strings.
    verify("hello there!"sv, "hello there general!"_utf16);

    // Short ASCII string against a UTF-16 string.
    verify("hello"sv, "😀"_utf16);

    // Short ASCII string against a UTF-16 string with the same code unit length.
    verify("ab"sv, "😀"_utf16);

    // Long ASCII string against a UTF-16 string.
    verify("hello there general!"sv, "😀"_utf16);

    // Long ASCII string against a UTF-16 string with the same code unit length.
    verify("ababababab"sv, "😀😀😀😀😀"_utf16);

    // A non-ASCII StringView is expected to compare unequal to the matching UTF-16 string.
    EXPECT_NE("😀"sv, "😀"_utf16);
}
|
||||
|
||||
// Exercises Utf16String::equals_ignoring_ascii_case against both Utf16String and Utf16View arguments.
TEST_CASE(equals_ignoring_ascii_case)
{
    auto verify = [](Utf16String const& subject, Utf16String const& different) {
        // Build a variant of the subject where every even-indexed code point is passed through
        // to_ascii_uppercase() and every odd-indexed code point is appended unchanged.
        StringBuilder mixed_case;
        for (auto [index, code_point] : enumerate(subject)) {
            if (index % 2 == 0)
                mixed_case.append_code_point(to_ascii_uppercase(code_point));
            else
                mixed_case.append_code_point(code_point);
        }

        auto variant = Utf16String::from_utf8(mixed_case.string_view());

        // The view-based comparisons below only run when the subject reports long UTF-16 storage.
        auto const compare_views = subject.has_long_utf16_storage();

        // String arguments, expected to match.
        EXPECT(subject.equals_ignoring_ascii_case(subject));
        EXPECT(subject.equals_ignoring_ascii_case(variant));
        EXPECT(variant.equals_ignoring_ascii_case(subject));
        EXPECT(variant.equals_ignoring_ascii_case(variant));

        if (compare_views) {
            // View arguments, expected to match.
            EXPECT(subject.equals_ignoring_ascii_case(subject.utf16_view()));
            EXPECT(subject.equals_ignoring_ascii_case(variant.utf16_view()));
            EXPECT(variant.equals_ignoring_ascii_case(subject.utf16_view()));
            EXPECT(variant.equals_ignoring_ascii_case(variant.utf16_view()));
        }

        // String arguments, expected not to match.
        EXPECT(!subject.equals_ignoring_ascii_case(different));
        EXPECT(!variant.equals_ignoring_ascii_case(different));
        EXPECT(!different.equals_ignoring_ascii_case(subject));
        EXPECT(!different.equals_ignoring_ascii_case(variant));

        if (compare_views) {
            // View arguments, expected not to match.
            EXPECT(!subject.equals_ignoring_ascii_case(different.utf16_view()));
            EXPECT(!variant.equals_ignoring_ascii_case(different.utf16_view()));
            EXPECT(!different.equals_ignoring_ascii_case(subject.utf16_view()));
            EXPECT(!different.equals_ignoring_ascii_case(variant.utf16_view()));
        }
    };

    // Empty (short ASCII) string against a short ASCII string.
    verify(Utf16String {}, "hello"_utf16);

    // Two short ASCII strings.
    verify("hello"_utf16, "there"_utf16);

    // Short ASCII string against a long ASCII string.
    verify("hello"_utf16, "hello there general!"_utf16);

    // Two long ASCII strings.
    verify("hello there!"_utf16, "hello there general!"_utf16);

    // Two UTF-16 strings.
    verify("😀"_utf16, "hello 😀"_utf16);

    // Short ASCII string against a UTF-16 string.
    verify("hello"_utf16, "😀"_utf16);

    // Short ASCII string against a UTF-16 string with the same code unit length.
    verify("ab"_utf16, "😀"_utf16);

    // Long ASCII string against a UTF-16 string.
    verify("hello there general!"_utf16, "😀"_utf16);

    // Long ASCII string against a UTF-16 string with the same code unit length.
    verify("ababababab"_utf16, "😀😀😀😀😀"_utf16);
}
|
||||
|
||||
// Checks that iterating a Utf16String yields the expected code points, and that dereferencing or
// incrementing the end iterator is caught by EXPECT_DEATH.
TEST_CASE(iteration)
{
    auto verify = [](Utf16String const& string, ReadonlySpan<u32> code_points) {
        EXPECT_EQ(string.length_in_code_points(), code_points.size());

        for (auto [index, code_point] : enumerate(string)) {
            if (code_points.size() != 0) {
                EXPECT_EQ(code_point, code_points[index]);
            } else {
                // Reaching the loop body with no expected code points is itself a failure.
                FAIL("Iterating an empty UTF-16 string should not produce any values");
            }
        }

        auto past_the_end = string.end();
        EXPECT_DEATH("Dereferencing a UTF-16 iterator which is at its end", *past_the_end);
        EXPECT_DEATH("Incrementing a UTF-16 iterator which is at its end", ++past_the_end);
    };

    verify({}, {});
    verify("hello"_utf16, { { 'h', 'e', 'l', 'l', 'o' } });
    verify("hello there general!"_utf16, { { 'h', 'e', 'l', 'l', 'o', ' ', 't', 'h', 'e', 'r', 'e', ' ', 'g', 'e', 'n', 'e', 'r', 'a', 'l', '!' } });
    verify("😀"_utf16, { { 0x1f600 } });
    verify("hello 😀 there!"_utf16, { { 'h', 'e', 'l', 'l', 'o', ' ', 0x1f600, ' ', 't', 'h', 'e', 'r', 'e', '!' } });
}
|
||||
|
||||
// Checks that Utf16String::code_unit_at agrees with Utf16View::code_unit_at at every code unit
// index of a string built from that view.
TEST_CASE(code_unit_at)
{
    auto verify = [](Utf16View const& view, size_t length_in_code_units) {
        auto converted = Utf16String::from_utf16(view);
        EXPECT_EQ(converted.length_in_code_units(), length_in_code_units);

        for (size_t index = 0; index < length_in_code_units; ++index) {
            EXPECT_EQ(converted.code_unit_at(index), view.code_unit_at(index));
        }
    };

    verify({}, 0);
    verify(u"hello"sv, 5);
    verify(u"hello there general!"sv, 20);
    verify(u"😀"sv, 2);
    verify(u"hello 😀 there!"sv, 15);
}
|
||||
|
||||
// Checks that Utf16String::code_point_at agrees with Utf16View::code_point_at at every code unit
// index of a string built from that view.
TEST_CASE(code_point_at)
{
    auto verify = [](Utf16View const& view, size_t length_in_code_points) {
        auto converted = Utf16String::from_utf16(view);
        EXPECT_EQ(converted.length_in_code_points(), length_in_code_points);

        // The index walks code units, not code points.
        auto const code_unit_count = converted.length_in_code_units();

        for (size_t index = 0; index < code_unit_count; ++index) {
            EXPECT_EQ(converted.code_point_at(index), view.code_point_at(index));
        }
    };

    verify({}, 0);
    verify(u"hello"sv, 5);
    verify(u"hello there general!"sv, 20);
    verify(u"😀"sv, 1);
    verify(u"hello 😀 there!"sv, 14);
}
|
Loading…
Add table
Add a link
Reference in a new issue