diff --git a/AK/CMakeLists.txt b/AK/CMakeLists.txt index 72169d3ff7b..d4cf43596cc 100644 --- a/AK/CMakeLists.txt +++ b/AK/CMakeLists.txt @@ -29,6 +29,8 @@ set(SOURCES StringUtils.cpp StringView.cpp Time.cpp + Utf16String.cpp + Utf16StringData.cpp Utf16View.cpp Utf32View.cpp Utf8View.cpp diff --git a/AK/Forward.h b/AK/Forward.h index 2f5fa46b71b..4c602156d49 100644 --- a/AK/Forward.h +++ b/AK/Forward.h @@ -19,6 +19,7 @@ template class ByteBuffer; class StringData; +class Utf16StringData; } @@ -52,6 +53,7 @@ class String; class StringBuilder; class StringView; class UnixDateTime; +class Utf16String; class Utf16View; class Utf32CodePointIterator; class Utf32View; @@ -198,6 +200,7 @@ using AK::StringView; using AK::TrailingCodePointTransformation; using AK::Traits; using AK::UnixDateTime; +using AK::Utf16String; using AK::Utf16View; using AK::Utf32CodePointIterator; using AK::Utf32View; diff --git a/AK/StringBuilder.cpp b/AK/StringBuilder.cpp index 53bfa358ae9..82409dbbf44 100644 --- a/AK/StringBuilder.cpp +++ b/AK/StringBuilder.cpp @@ -249,8 +249,10 @@ ErrorOr StringBuilder::try_append(Utf16View const& utf16_view) { if (utf16_view.is_empty()) return {}; + if (utf16_view.has_ascii_storage()) + return try_append(utf16_view.bytes()); - auto remaining_view = utf16_view.span(); + auto remaining_view = utf16_view.utf16_span(); auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(remaining_view); // Possibly over-allocate a little to ensure we don't have to allocate later. 
diff --git a/AK/Utf16String.cpp b/AK/Utf16String.cpp new file mode 100644 index 00000000000..ecd1e9db32b --- /dev/null +++ b/AK/Utf16String.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2025, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include + +#include + +namespace AK { + +/* The inline short string and the Utf16StringData pointer share a single union in Utf16StringBase; this checks that both members have the same size. */ +static_assert(sizeof(Detail::ShortString) == sizeof(Detail::Utf16StringData*)); + +/* Builds a Utf16String from UTF-8 text without checking that the input is well-formed. ASCII input of at most MAX_SHORT_STRING_BYTE_COUNT bytes is copied into the inline short string; anything else is handed to Utf16StringData::from_utf8 with ASCII storage allowed. */ +Utf16String Utf16String::from_utf8_without_validation(StringView utf8_string) +{ + if (utf8_string.length() <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf8_string.is_ascii()) { + Utf16String string; + string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(utf8_string.length()); + + auto result = utf8_string.bytes().copy_to(string.m_value.short_ascii_string.storage); + VERIFY(result == utf8_string.length()); + + return string; + } + + return Utf16String { Detail::Utf16StringData::from_utf8(utf8_string, Detail::Utf16StringData::AllowASCIIStorage::Yes) }; +} + +/* Builds a Utf16String from a UTF-16 view without validating it. ASCII content of at most MAX_SHORT_STRING_BYTE_COUNT code units is stored inline: copied byte-for-byte when the view already has ASCII storage, otherwise converted from 16-bit code units with simdutf::convert_utf16_to_utf8. Everything else goes through Utf16StringData::from_utf16. */ +Utf16String Utf16String::from_utf16_without_validation(Utf16View const& utf16_string) +{ + if (utf16_string.length_in_code_units() <= Detail::MAX_SHORT_STRING_BYTE_COUNT && utf16_string.is_ascii()) { + Utf16String string; + string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(utf16_string.length_in_code_units()); + + if (utf16_string.has_ascii_storage()) { + auto result = utf16_string.bytes().copy_to(string.m_value.short_ascii_string.storage); + VERIFY(result == utf16_string.length_in_code_units()); + } else { + auto result = simdutf::convert_utf16_to_utf8(utf16_string.utf16_span().data(), utf16_string.length_in_code_units(), reinterpret_cast(string.m_value.short_ascii_string.storage)); + VERIFY(result == utf16_string.length_in_code_units()); + } + + return string; + } + + return Utf16String { Detail::Utf16StringData::from_utf16(utf16_string) }; +} + +/* Builds a Utf16String from UTF-32 code points. ASCII input of at most MAX_SHORT_STRING_BYTE_COUNT code points is converted straight into the inline short string; everything else goes through Utf16StringData::from_utf32. */ +Utf16String Utf16String::from_utf32(Utf32View const& utf32_string) +{ + if (utf32_string.length() <=
Detail::MAX_SHORT_STRING_BYTE_COUNT && utf32_string.is_ascii()) { + Utf16String string; + string.m_value.short_ascii_string = Detail::ShortString::create_with_byte_count(utf32_string.length()); + + auto result = simdutf::convert_utf32_to_utf8(reinterpret_cast(utf32_string.code_points()), utf32_string.length(), reinterpret_cast(string.m_value.short_ascii_string.storage)); + VERIFY(result == utf32_string.length()); + + return string; + } + + return Utf16String { Detail::Utf16StringData::from_utf32(utf32_string) }; +} + +/* Formats a Utf16String. Strings with long UTF-16 storage are appended to the underlying StringBuilder as a UTF-16 view; ASCII-stored strings (short or long) are emitted through put_string. */ +ErrorOr Formatter::format(FormatBuilder& builder, Utf16String const& utf16_string) +{ + if (utf16_string.has_long_utf16_storage()) + return builder.builder().try_append(utf16_string.utf16_view()); + return builder.put_string(utf16_string.ascii_view()); +} + +} diff --git a/AK/Utf16String.h b/AK/Utf16String.h new file mode 100644 index 00000000000..44ba0974482 --- /dev/null +++ b/AK/Utf16String.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2025, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace AK { + +// Utf16String is a strongly owned sequence of Unicode code points encoded as UTF-16. +// +// The data may or may not be heap-allocated, and may or may not be reference counted. As a memory optimization, if the +// UTF-16 string is entirely ASCII, the string is stored as 8-bit bytes.
+class [[nodiscard]] Utf16String : public Detail::Utf16StringBase { +public: + using Utf16StringBase::Utf16StringBase; + + explicit constexpr Utf16String(Utf16StringBase&& base) + : Utf16StringBase(move(base)) + { + } + + ALWAYS_INLINE static Utf16String from_utf8(StringView utf8_string) + { + VERIFY(Utf8View { utf8_string }.validate()); + return from_utf8_without_validation(utf8_string); + } + + ALWAYS_INLINE static Utf16String from_utf8(String const& utf8_string) + { + return from_utf8_without_validation(utf8_string); + } + + ALWAYS_INLINE static ErrorOr try_from_utf8(StringView utf8_string) + { + if (!Utf8View { utf8_string }.validate()) + return Error::from_string_literal("Input was not valid UTF-8"); + return from_utf8_without_validation(utf8_string); + } + + ALWAYS_INLINE static Utf16String from_utf16(Utf16View const& utf16_string) + { + VERIFY(utf16_string.validate()); + return from_utf16_without_validation(utf16_string); + } + + ALWAYS_INLINE static ErrorOr try_from_utf16(Utf16View const& utf16_string) + { + if (!utf16_string.validate()) + return Error::from_string_literal("Input was not valid UTF-16"); + return from_utf16_without_validation(utf16_string); + } + + static Utf16String from_utf8_without_validation(StringView); + static Utf16String from_utf16_without_validation(Utf16View const&); + static Utf16String from_utf32(Utf32View const&); + + template + requires(IsOneOf, Utf16String>) + static Utf16String from_utf16(T&&) = delete; + + template + requires(IsOneOf, Utf16String>) + static ErrorOr try_from_utf16(T&&) = delete; + + template + requires(IsOneOf, Utf16String>) + static Utf16String from_utf16_without_validation(T&&) = delete; + +private: + ALWAYS_INLINE explicit Utf16String(NonnullRefPtr value) + : Utf16StringBase(move(value)) + { + } +}; + +template<> +struct Formatter : Formatter { + ErrorOr format(FormatBuilder&, Utf16String const&); +}; + +template<> +struct Traits : public DefaultTraits { + static unsigned hash(Utf16String const& s) { 
return s.hash(); } +}; + +} + +[[nodiscard]] ALWAYS_INLINE AK::Utf16String operator""_utf16(char const* string, size_t length) +{ + AK::StringView view { string, length }; + + ASSERT(AK::Utf8View { view }.validate()); + return AK::Utf16String::from_utf8_without_validation(view); +} + +[[nodiscard]] ALWAYS_INLINE AK::Utf16String operator""_utf16(char16_t const* string, size_t length) +{ + AK::Utf16View view { string, length }; + + ASSERT(view.validate()); + return AK::Utf16String::from_utf16_without_validation(view); +} diff --git a/AK/Utf16StringBase.h b/AK/Utf16StringBase.h new file mode 100644 index 00000000000..31b20fde526 --- /dev/null +++ b/AK/Utf16StringBase.h @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2025, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include +#include +#include +#include + +namespace AK::Detail { + +class Utf16StringBase { +public: + constexpr Utf16StringBase() + : Utf16StringBase(ShortString::create_empty()) + { + } + + explicit constexpr Utf16StringBase(ShortString short_string) + : m_value { .short_ascii_string = short_string } + { + } + + ALWAYS_INLINE explicit Utf16StringBase(NonnullRefPtr value) + : m_value { .data = &value.leak_ref() } + { + } + + ALWAYS_INLINE Utf16StringBase(Utf16StringBase const& other) + : m_value(other.m_value) + { + if (has_long_storage()) + data_without_union_member_assertion()->ref(); + } + + constexpr Utf16StringBase(Utf16StringBase&& other) + : m_value(other.m_value) + { + other.m_value = { .short_ascii_string = ShortString::create_empty() }; + } + + constexpr ~Utf16StringBase() + { + if (!is_constant_evaluated()) + destroy_string(); + } + + ALWAYS_INLINE operator Utf16View() const& { return utf16_view(); } + explicit operator Utf16View() const&& = delete; + + [[nodiscard]] ALWAYS_INLINE String to_utf8(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const + { + return MUST(utf16_view().to_utf8(allow_lonely_surrogates)); + } + + 
[[nodiscard]] ALWAYS_INLINE String to_utf8_but_should_be_ported_to_utf16(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const + { + return to_utf8(allow_lonely_surrogates); + } + + [[nodiscard]] ALWAYS_INLINE ByteString to_byte_string(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const + { + return MUST(utf16_view().to_byte_string(allow_lonely_surrogates)); + } + + [[nodiscard]] ALWAYS_INLINE StringView ascii_view() const& + { + if (has_short_ascii_storage()) + return short_ascii_string_without_union_member_assertion().bytes(); + + VERIFY(has_long_ascii_storage()); + return data_without_union_member_assertion()->ascii_view(); + } + + [[nodiscard]] ALWAYS_INLINE Utf16View utf16_view() const& + { + if (has_short_ascii_storage()) + return Utf16View { ascii_view().characters_without_null_termination(), length_in_code_units() }; + return data_without_union_member_assertion()->utf16_view(); + } + + StringView ascii_view() const&& = delete; + Utf16View utf16_view() const&& = delete; + + ALWAYS_INLINE Utf16StringBase& operator=(Utf16StringBase const& other) + { + if (&other != this) { + if (has_long_storage()) + data_without_union_member_assertion()->unref(); + + m_value = other.m_value; + + if (has_long_storage()) + data_without_union_member_assertion()->ref(); + } + + return *this; + } + + // Move assignment: releases our current data (if any), takes over other's storage, and leaves other as the empty + // short string. + ALWAYS_INLINE Utf16StringBase& operator=(Utf16StringBase&& other) + { + // Self-move-assignment must be a no-op, just like self-copy-assignment above. Without this guard we would + // drop our reference and then have exchange() hand the very same pointer back to us, leaving this string + // pointing at data it no longer holds a reference to. + if (&other != this) { + if (has_long_storage()) + data_without_union_member_assertion()->unref(); + + m_value = exchange(other.m_value, { .short_ascii_string = ShortString::create_empty() }); + } + + return *this; + } + + [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16StringBase const& other) const + { + if (has_short_ascii_storage() && other.has_short_ascii_storage()) + return bit_cast(m_value) == bit_cast(other.m_value); + + if (has_long_storage() && other.has_long_storage()) + return *data_without_union_member_assertion() == *other.data_without_union_member_assertion(); + + return
utf16_view() == other.utf16_view(); + } + + [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16View const& other) const { return utf16_view() == other; } + [[nodiscard]] ALWAYS_INLINE bool operator==(StringView other) const { return utf16_view() == other; } + + [[nodiscard]] ALWAYS_INLINE bool equals_ignoring_ascii_case(Utf16View const& other) const { return utf16_view().equals_ignoring_ascii_case(other); } + [[nodiscard]] ALWAYS_INLINE bool equals_ignoring_ascii_case(Utf16StringBase const& other) const { return utf16_view().equals_ignoring_ascii_case(other.utf16_view()); } + + template + [[nodiscard]] ALWAYS_INLINE bool is_one_of(Ts&&... strings) const + { + return (this->operator==(forward(strings)) || ...); + } + + template + [[nodiscard]] ALWAYS_INLINE bool is_one_of_ignoring_ascii_case(Ts&&... strings) const + { + return (this->equals_ignoring_ascii_case(forward(strings)) || ...); + } + + [[nodiscard]] ALWAYS_INLINE u32 hash() const + { + if (has_short_ascii_storage()) + return StringView { short_ascii_string_without_union_member_assertion().bytes() }.hash(); + return data_without_union_member_assertion()->hash(); + } + + [[nodiscard]] ALWAYS_INLINE bool is_empty() const { return length_in_code_units() == 0uz; } + [[nodiscard]] ALWAYS_INLINE bool is_ascii() const { return utf16_view().is_ascii(); } + + [[nodiscard]] ALWAYS_INLINE size_t length_in_code_units() const + { + if (has_short_ascii_storage()) + return short_ascii_string_without_union_member_assertion().byte_count(); + return data_without_union_member_assertion()->length_in_code_units(); + } + + [[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const + { + if (has_short_ascii_storage()) + return short_ascii_string_without_union_member_assertion().byte_count(); + return data_without_union_member_assertion()->length_in_code_points(); + } + + [[nodiscard]] ALWAYS_INLINE char16_t code_unit_at(size_t code_unit_offset) const { return utf16_view().code_unit_at(code_unit_offset); } + [[nodiscard]] 
ALWAYS_INLINE u32 code_point_at(size_t code_unit_offset) const { return utf16_view().code_point_at(code_unit_offset); } + + [[nodiscard]] ALWAYS_INLINE size_t code_unit_offset_of(size_t code_point_offset) const + { + if (has_ascii_storage()) + return code_point_offset; + return utf16_view().code_unit_offset_of(code_point_offset); + } + + [[nodiscard]] ALWAYS_INLINE size_t code_point_offset_of(size_t code_unit_offset) const + { + if (has_ascii_storage()) + return code_unit_offset; + return utf16_view().code_point_offset_of(code_unit_offset); + } + + [[nodiscard]] ALWAYS_INLINE Utf16CodePointIterator begin() const { return utf16_view().begin(); } + [[nodiscard]] ALWAYS_INLINE Utf16CodePointIterator end() const { return utf16_view().end(); } + + [[nodiscard]] ALWAYS_INLINE Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const + { + return utf16_view().substring_view(code_unit_offset, code_unit_length); + } + + [[nodiscard]] ALWAYS_INLINE Utf16View substring_view(size_t code_unit_offset) const + { + return utf16_view().substring_view(code_unit_offset); + } + + ALWAYS_INLINE Optional find_code_unit_offset(char16_t needle, size_t start_offset = 0) const + { + return utf16_view().find_code_unit_offset(needle, start_offset); + } + + ALWAYS_INLINE Optional find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const + { + return utf16_view().find_code_unit_offset(needle, start_offset); + } + + ALWAYS_INLINE Optional find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset = 0) const + { + return utf16_view().find_code_unit_offset_ignoring_case(needle, start_offset); + } + + [[nodiscard]] ALWAYS_INLINE bool starts_with(Utf16View const& needle) const { return utf16_view().starts_with(needle); } + + // This is primarily interesting to unit tests. 
+ /* Returns true when the short-string flag bit is set, i.e. the string lives inline in m_value. During constant evaluation the union member is read directly; otherwise it is read through short_ascii_string_without_union_member_assertion(). */ + [[nodiscard]] constexpr bool has_short_ascii_storage() const + { + if (is_constant_evaluated()) + return (m_value.short_ascii_string.byte_count_and_short_string_flag & StringBase::SHORT_STRING_FLAG) != 0; + return (short_ascii_string_without_union_member_assertion().byte_count_and_short_string_flag & StringBase::SHORT_STRING_FLAG) != 0; + } + + // This is primarily interesting to unit tests. + // True only for strings backed by a Utf16StringData whose data is stored as ASCII. + [[nodiscard]] ALWAYS_INLINE bool has_long_ascii_storage() const + { + if (has_short_ascii_storage()) + return false; + return data_without_union_member_assertion()->has_ascii_storage(); + } + + // This is primarily interesting to unit tests. + // True for either ASCII representation: the inline short string or long ASCII data. + [[nodiscard]] ALWAYS_INLINE bool has_ascii_storage() const + { + return has_short_ascii_storage() || has_long_ascii_storage(); + } + + // This is primarily interesting to unit tests. + // True only for strings backed by a Utf16StringData whose data is stored as UTF-16 code units. + [[nodiscard]] ALWAYS_INLINE bool has_long_utf16_storage() const + { + if (has_short_ascii_storage()) + return false; + return data_without_union_member_assertion()->has_utf16_storage(); + } + + // This is primarily interesting to unit tests. + // Any string that is not stored inline is backed by a Utf16StringData. + [[nodiscard]] ALWAYS_INLINE bool has_long_storage() const + { + return !has_short_ascii_storage(); + } + +protected: + // Drops this string's reference to its Utf16StringData, if it has one. Short strings hold no reference. + ALWAYS_INLINE void destroy_string() const + { + if (has_long_storage()) + data_without_union_member_assertion()->unref(); + } + + // This is technically **invalid**! See StringBase for details.
+ ALWAYS_INLINE ShortString const& short_ascii_string_without_union_member_assertion() const { return *__builtin_launder(&m_value.short_ascii_string); } + ALWAYS_INLINE Utf16StringData const* data_without_union_member_assertion() const { return *__builtin_launder(&m_value.data); } + + union { + ShortString short_ascii_string; + Utf16StringData const* data; + } m_value; +}; + +} diff --git a/AK/Utf16StringData.cpp b/AK/Utf16StringData.cpp new file mode 100644 index 00000000000..1361777d76e --- /dev/null +++ b/AK/Utf16StringData.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2025, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include +#include + +#include + +namespace AK::Detail { + +// Due to internal optimizations, we have an explicit maximum string length of 2**63 - 1. +#define VERIFY_UTF16_LENGTH(length) VERIFY(length >> Detail::UTF16_FLAG == 0); + +NonnullRefPtr Utf16StringData::create_uninitialized(StorageType storage_type, size_t code_unit_length) +{ + auto allocation_size = storage_type == Utf16StringData::StorageType::ASCII + ? 
sizeof(Utf16StringData) + (sizeof(char) * code_unit_length) + : sizeof(Utf16StringData) + (sizeof(char16_t) * code_unit_length); + + void* slot = malloc(allocation_size); + VERIFY(slot); + + return adopt_ref(*new (slot) Utf16StringData(storage_type, code_unit_length)); +} + +template +NonnullRefPtr Utf16StringData::create_from_code_point_iterable(ViewType const& view) +{ + size_t code_unit_length = 0; + size_t code_point_length = 0; + + for (auto code_point : view) { + code_unit_length += UnicodeUtils::code_unit_length_for_code_point(code_point); + ++code_point_length; + } + + VERIFY_UTF16_LENGTH(code_unit_length); + + auto string = create_uninitialized(StorageType::UTF16, code_unit_length); + string->m_length_in_code_points = code_point_length; + + size_t code_unit_index = 0; + + for (auto code_point : view) { + (void)UnicodeUtils::code_point_to_utf16(code_point, [&](auto code_unit) { + string->m_utf16_data[code_unit_index++] = code_unit; + }); + } + + return string; +} + +NonnullRefPtr Utf16StringData::from_utf8(StringView utf8_string, AllowASCIIStorage allow_ascii_storage) +{ + RefPtr string; + + if (allow_ascii_storage == AllowASCIIStorage::Yes && utf8_string.is_ascii()) { + VERIFY_UTF16_LENGTH(utf8_string.length()); + + string = create_uninitialized(StorageType::ASCII, utf8_string.length()); + TypedTransfer::copy(string->m_ascii_data, utf8_string.characters_without_null_termination(), utf8_string.length()); + } else if (Utf8View view { utf8_string }; view.validate(AllowLonelySurrogates::No)) { + auto code_unit_length = simdutf::utf16_length_from_utf8(utf8_string.characters_without_null_termination(), utf8_string.length()); + VERIFY_UTF16_LENGTH(code_unit_length); + + string = create_uninitialized(StorageType::UTF16, code_unit_length); + + auto result = simdutf::convert_utf8_to_utf16(utf8_string.characters_without_null_termination(), utf8_string.length(), string->m_utf16_data); + VERIFY(result == code_unit_length); + } else { + string = 
create_from_code_point_iterable(view); + } + + return string.release_nonnull(); +} + +NonnullRefPtr Utf16StringData::from_utf16(Utf16View const& utf16_string) +{ + VERIFY_UTF16_LENGTH(utf16_string.length_in_code_units()); + RefPtr string; + + if (utf16_string.has_ascii_storage()) { + string = create_uninitialized(StorageType::ASCII, utf16_string.length_in_code_units()); + TypedTransfer::copy(string->m_ascii_data, utf16_string.ascii_span().data(), utf16_string.length_in_code_units()); + } else if (utf16_string.is_ascii()) { + string = create_uninitialized(StorageType::ASCII, utf16_string.length_in_code_units()); + + auto result = simdutf::convert_utf16_to_utf8(utf16_string.utf16_span().data(), utf16_string.length_in_code_units(), string->m_ascii_data); + VERIFY(result == utf16_string.length_in_code_units()); + } else { + string = create_uninitialized(StorageType::UTF16, utf16_string.length_in_code_units()); + TypedTransfer::copy(string->m_utf16_data, utf16_string.utf16_span().data(), utf16_string.length_in_code_units()); + + string->m_length_in_code_points = utf16_string.m_length_in_code_points; + } + + return string.release_nonnull(); +} + +NonnullRefPtr Utf16StringData::from_utf32(Utf32View const& utf32_string) +{ + RefPtr string; + + auto const* utf32_data = reinterpret_cast(utf32_string.code_points()); + auto utf32_length = utf32_string.length(); + + if (utf32_string.is_ascii()) { + VERIFY_UTF16_LENGTH(utf32_length); + + string = create_uninitialized(StorageType::ASCII, utf32_length); + + auto result = simdutf::convert_utf32_to_utf8(utf32_data, utf32_length, string->m_ascii_data); + VERIFY(result == utf32_length); + } else if (simdutf::validate_utf32(utf32_data, utf32_length)) { + auto code_unit_length = simdutf::utf16_length_from_utf32(utf32_data, utf32_length); + VERIFY_UTF16_LENGTH(code_unit_length); + + string = create_uninitialized(StorageType::UTF16, code_unit_length); + string->m_length_in_code_points = utf32_length; + + auto result = 
simdutf::convert_utf32_to_utf16(utf32_data, utf32_length, string->m_utf16_data); + VERIFY(result == code_unit_length); + } else { + string = create_from_code_point_iterable(utf32_string); + } + + return string.release_nonnull(); +} + +size_t Utf16StringData::calculate_code_point_length() const +{ + ASSERT(!has_ascii_storage()); + + if (simdutf::validate_utf16(m_utf16_data, length_in_code_units())) + return simdutf::count_utf16(m_utf16_data, length_in_code_units()); + + size_t code_points = 0; + for ([[maybe_unused]] auto code_point : utf16_view()) + ++code_points; + return code_points; +} + +} diff --git a/AK/Utf16StringData.h b/AK/Utf16StringData.h new file mode 100644 index 00000000000..e0340a20d65 --- /dev/null +++ b/AK/Utf16StringData.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2025, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +namespace AK::Detail { + +class Utf16StringData final : public RefCounted { +public: + enum class StorageType : u8 { + ASCII, + UTF16, + }; + + enum class AllowASCIIStorage : u8 { + No, + Yes, + }; + + static NonnullRefPtr from_utf8(StringView, AllowASCIIStorage); + static NonnullRefPtr from_utf16(Utf16View const&); + static NonnullRefPtr from_utf32(Utf32View const&); + + ~Utf16StringData() = default; + + void operator delete(void* ptr) + { + free(ptr); + } + + [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16StringData const& other) const + { + return utf16_view() == other.utf16_view(); + } + + [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16View const& other) const + { + return utf16_view() == other; + } + + [[nodiscard]] ALWAYS_INLINE bool operator==(StringView const& other) const + { + if (has_ascii_storage()) + return ascii_view() == other; + return utf16_view() == Utf16View { other.characters_without_null_termination(), other.length() }; + } + + [[nodiscard]] ALWAYS_INLINE bool has_ascii_storage() const { return 
m_length_in_code_units >> Detail::UTF16_FLAG == 0; } + [[nodiscard]] ALWAYS_INLINE bool has_utf16_storage() const { return m_length_in_code_units >> Detail::UTF16_FLAG != 0; } + + // Returns the hash of this string, computing it on first use and caching it in m_hash afterwards. + ALWAYS_INLINE u32 hash() const + { + if (!m_has_hash) { + m_hash = calculate_hash(); + // Remember that the cached value is valid; otherwise every call would recompute the hash. + m_has_hash = true; + } + return m_hash; + } + + [[nodiscard]] ALWAYS_INLINE size_t length_in_code_units() const { return m_length_in_code_units & ~(1uz << Detail::UTF16_FLAG); } + [[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const + { + if (has_ascii_storage()) + return length_in_code_units(); + if (m_length_in_code_points == NumericLimits::max()) + m_length_in_code_points = calculate_code_point_length(); + return m_length_in_code_points; + } + + [[nodiscard]] ALWAYS_INLINE StringView ascii_view() const + { + ASSERT(has_ascii_storage()); + return { m_ascii_data, length_in_code_units() }; + } + + [[nodiscard]] ALWAYS_INLINE Utf16View utf16_view() const + { + if (has_ascii_storage()) + return { m_ascii_data, length_in_code_units() }; + + Utf16View view { m_utf16_data, length_in_code_units() }; + view.m_length_in_code_points = m_length_in_code_points; + + return view; + } + +private: + ALWAYS_INLINE Utf16StringData(StorageType storage_type, size_t code_unit_length) + : m_length_in_code_units(code_unit_length) + { + if (storage_type == StorageType::UTF16) + m_length_in_code_units |= 1uz << Detail::UTF16_FLAG; + } + + static NonnullRefPtr create_uninitialized(StorageType storage_type, size_t code_unit_length); + + template + static NonnullRefPtr create_from_code_point_iterable(ViewType const&); + + [[nodiscard]] size_t calculate_code_point_length() const; + + [[nodiscard]] ALWAYS_INLINE u32 calculate_hash() const + { + if (has_ascii_storage()) + return ascii_view().hash(); + return utf16_view().hash(); + } + + // We store whether this string has ASCII or UTF-16 storage by setting the most significant bit of m_length_in_code_units + // to 1 for UTF-16 storage.
This shrinks the size of most UTF-16 string related classes, at the cost of not being + // allowed to create a string larger than 2**63 - 1. + size_t m_length_in_code_units { 0 }; + mutable size_t m_length_in_code_points { NumericLimits::max() }; + + mutable u32 m_hash { 0 }; + mutable bool m_has_hash { false }; + + union { + char m_ascii_data[0]; + char16_t m_utf16_data[0]; + }; +}; + +} diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp index 1ab94e296be..f797acb2bdf 100644 --- a/AK/Utf16View.cpp +++ b/AK/Utf16View.cpp @@ -99,15 +99,19 @@ ErrorOr Utf16View::to_utf8(AllowLonelySurrogates allow_lonely_surrogates { if (is_empty()) return String {}; + if (has_ascii_storage()) + return String::from_utf8_without_validation(bytes()); + if (!validate(allow_lonely_surrogates)) return Error::from_string_literal("Input was not valid UTF-16"); if (allow_lonely_surrogates == AllowLonelySurrogates::No) { String result; - auto utf8_length = simdutf::utf8_length_from_utf16(m_string, length_in_code_units()); + + auto utf8_length = simdutf::utf8_length_from_utf16(m_string.utf16, length_in_code_units()); TRY(result.replace_with_new_string(Badge {}, utf8_length, [&](Bytes buffer) -> ErrorOr { - [[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(m_string, length_in_code_units(), reinterpret_cast(buffer.data())); + [[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(m_string.utf16, length_in_code_units(), reinterpret_cast(buffer.data())); ASSERT(result == buffer.size()); return {}; })); @@ -127,17 +131,25 @@ ErrorOr Utf16View::to_byte_string(AllowLonelySurrogates allow_lonely bool Utf16View::is_ascii() const { + if (has_ascii_storage()) + return true; + // FIXME: Petition simdutf to implement an ASCII validator for UTF-16. 
- return all_of(span(), AK::is_ascii); + return all_of(utf16_span(), AK::is_ascii); } bool Utf16View::validate(size_t& valid_code_units, AllowLonelySurrogates allow_lonely_surrogates) const { + if (has_ascii_storage()) { + valid_code_units = length_in_code_units(); + return true; + } + auto view = *this; valid_code_units = 0; while (!view.is_empty()) { - auto result = simdutf::validate_utf16_with_errors(view.m_string, view.length_in_code_units()); + auto result = simdutf::validate_utf16_with_errors(view.m_string.utf16, view.length_in_code_units()); valid_code_units += result.count; if (result.error == simdutf::SUCCESS) @@ -197,7 +209,9 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod return substring_view(code_point_offset, code_point_length); auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { - return it.m_iterator - m_string; + if (has_ascii_storage()) + return it.m_iterator.ascii - m_string.ascii; + return it.m_iterator.utf16 - m_string.utf16; }; size_t code_point_index = 0; @@ -220,9 +234,11 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod size_t Utf16View::calculate_length_in_code_points() const { + ASSERT(!has_ascii_storage()); + // simdutf's code point length method assumes valid UTF-16, whereas we allow lonely surrogates. 
if (validate(AllowLonelySurrogates::No)) [[likely]] - return simdutf::count_utf16(m_string, length_in_code_units()); + return simdutf::count_utf16(m_string.utf16, length_in_code_units()); size_t code_points = 0; for ([[maybe_unused]] auto code_point : *this) diff --git a/AK/Utf16View.h b/AK/Utf16View.h index 910544df74e..2818cb92cef 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -37,6 +37,13 @@ ErrorOr utf32_to_utf16(Utf32View const&); size_t utf16_code_unit_length_from_utf8(StringView); +namespace Detail { + +static constexpr inline auto UTF16_FLAG = NumericLimits::digits() - 1; +class Utf16StringBase; + +} + class Utf16CodePointIterator { friend class Utf16View; @@ -46,23 +53,35 @@ public: constexpr Utf16CodePointIterator& operator++() { - VERIFY(m_remaining_code_units > 0); + auto remaining_code_units = this->remaining_code_units(); + VERIFY(remaining_code_units > 0); - auto length = min(length_in_code_units(), m_remaining_code_units); - m_iterator += length; - m_remaining_code_units -= length; + if (has_ascii_storage()) { + ++m_iterator.ascii; + --m_remaining_code_units; + } else { + auto length = min(length_in_code_units(), remaining_code_units); + + m_iterator.utf16 += length; + m_remaining_code_units -= length; + } return *this; } constexpr u32 operator*() const { - VERIFY(m_remaining_code_units > 0); - auto code_unit = *m_iterator; + auto remaining_code_units = this->remaining_code_units(); + VERIFY(remaining_code_units > 0); + + if (has_ascii_storage()) + return *m_iterator.ascii; + + auto code_unit = *m_iterator.utf16; if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) { - if (m_remaining_code_units > 1) { - auto next_code_unit = *(m_iterator + 1); + if (remaining_code_units > 1) { + auto next_code_unit = *(m_iterator.utf16 + 1); if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit)) return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit); @@ -79,22 +98,46 @@ public: [[nodiscard]] constexpr bool 
operator==(Utf16CodePointIterator const& other) const { - return (m_iterator == other.m_iterator) && (m_remaining_code_units == other.m_remaining_code_units); + // Note that this also protects against iterators with different underlying storage. + if (m_remaining_code_units != other.m_remaining_code_units) + return false; + + if (has_ascii_storage()) + return m_iterator.ascii == other.m_iterator.ascii; + return m_iterator.utf16 == other.m_iterator.utf16; } - [[nodiscard]] constexpr size_t length_in_code_units() const + // Number of code units occupied by the code point this iterator currently points at (always 1 for ASCII storage). + // This stays constexpr and const: the constexpr operator++ calls it, and it must remain callable on const iterators. + [[nodiscard]] constexpr size_t length_in_code_units() const { + if (has_ascii_storage()) + return 1; return UnicodeUtils::code_unit_length_for_code_point(**this); } private: - constexpr Utf16CodePointIterator(char16_t const* ptr, size_t length) - : m_iterator(ptr) + constexpr Utf16CodePointIterator(char const* iterator, size_t length) + : m_iterator { .ascii = iterator } , m_remaining_code_units(length) { } - char16_t const* m_iterator { nullptr }; + constexpr Utf16CodePointIterator(char16_t const* iterator, size_t length) + : m_iterator { .utf16 = iterator } + , m_remaining_code_units(length) + { + m_remaining_code_units |= 1uz << Detail::UTF16_FLAG; + } + + constexpr bool has_ascii_storage() const { return m_remaining_code_units >> Detail::UTF16_FLAG == 0; } + constexpr size_t remaining_code_units() const { return m_remaining_code_units & ~(1uz << Detail::UTF16_FLAG); } + + union { + char const* ascii; + char16_t const* utf16; + } m_iterator { .ascii = nullptr }; + + // Just like Utf16StringData, we store whether this string has ASCII or UTF-16 storage by setting the most + // significant bit of m_remaining_code_units for UTF-16 storage.
size_t m_remaining_code_units { 0 }; }; @@ -106,38 +149,86 @@ public: ~Utf16View() = default; constexpr Utf16View(char16_t const* string, size_t length_in_code_units) - : m_string(string) + : m_string { .utf16 = string } , m_length_in_code_units(length_in_code_units) { + m_length_in_code_units |= 1uz << Detail::UTF16_FLAG; } constexpr Utf16View(Utf16Data const& string) - : m_string(string.data()) + : m_string { .utf16 = string.data() } , m_length_in_code_units(string.size()) { + m_length_in_code_units |= 1uz << Detail::UTF16_FLAG; + } + + consteval Utf16View(StringView string) + : m_string { .ascii = string.characters_without_null_termination() } + , m_length_in_code_units(string.length()) + { + VERIFY(all_of(string, AK::is_ascii)); } Utf16View(Utf16ConversionResult&&) = delete; explicit Utf16View(Utf16ConversionResult const& conversion_result) - : m_string(conversion_result.data.data()) + : m_string { .utf16 = conversion_result.data.data() } , m_length_in_code_units(conversion_result.data.size()) , m_length_in_code_points(conversion_result.code_point_count) { + m_length_in_code_units |= 1uz << Detail::UTF16_FLAG; } ErrorOr to_utf8(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const; ErrorOr to_byte_string(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const; - [[nodiscard]] constexpr ReadonlySpan span() const + ALWAYS_INLINE String to_utf8_but_should_be_ported_to_utf16(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const { - return { m_string, length_in_code_units() }; + return MUST(to_utf8(allow_lonely_surrogates)); + } + + // The view has ASCII storage when the most significant bit of m_length_in_code_units is clear. This is constexpr + // (like Utf16CodePointIterator::has_ascii_storage) because the constexpr members of this class branch on it; a + // non-constexpr predicate would prevent them from being constant-evaluated. + [[nodiscard]] constexpr bool has_ascii_storage() const { return m_length_in_code_units >> Detail::UTF16_FLAG == 0; } + + [[nodiscard]] constexpr ReadonlyBytes bytes() const + { + VERIFY(has_ascii_storage()); + return { m_string.ascii, length_in_code_units() }; + } + + [[nodiscard]] constexpr ReadonlySpan ascii_span() const + { + VERIFY(has_ascii_storage()); + return { m_string.ascii,
length_in_code_units() }; + } + + [[nodiscard]] constexpr ReadonlySpan utf16_span() const + { + VERIFY(!has_ascii_storage()); + return { m_string.utf16, length_in_code_units() }; } [[nodiscard]] constexpr bool operator==(Utf16View const& other) const { if (length_in_code_units() != other.length_in_code_units()) return false; - return TypedTransfer::compare(m_string, other.m_string, length_in_code_units()); + + if (has_ascii_storage() && other.has_ascii_storage()) + return TypedTransfer::compare(m_string.ascii, other.m_string.ascii, length_in_code_units()); + if (!has_ascii_storage() && !other.has_ascii_storage()) + return TypedTransfer::compare(m_string.utf16, other.m_string.utf16, length_in_code_units()); + + for (size_t i = 0; i < length_in_code_units(); ++i) { + if (code_unit_at(i) != other.code_unit_at(i)) + return false; + } + + return true; + } + + [[nodiscard]] constexpr bool operator==(StringView other) const + { + if (has_ascii_storage()) + return bytes() == other.bytes(); + return *this == Utf16View { other.characters_without_null_termination(), other.length() }; } [[nodiscard]] constexpr bool equals_ignoring_case(Utf16View const& other) const @@ -175,10 +266,18 @@ public: { if (is_empty()) return 0; - return string_hash(reinterpret_cast(m_string), length_in_code_units() * sizeof(char16_t)); + if (has_ascii_storage()) + return string_hash(m_string.ascii, length_in_code_units()); + return string_hash(reinterpret_cast(m_string.utf16), length_in_code_units() * sizeof(char16_t)); + } + + [[nodiscard]] constexpr bool is_null() const + { + if (has_ascii_storage()) + return m_string.ascii == nullptr; + return m_string.utf16 == nullptr; } - [[nodiscard]] constexpr bool is_null() const { return m_string == nullptr; } [[nodiscard]] constexpr bool is_empty() const { return length_in_code_units() == 0; } [[nodiscard]] bool is_ascii() const; @@ -190,10 +289,13 @@ public: [[nodiscard]] bool validate(size_t& valid_code_units, AllowLonelySurrogates = 
AllowLonelySurrogates::Yes) const; - [[nodiscard]] constexpr size_t length_in_code_units() const { return m_length_in_code_units; } + [[nodiscard]] constexpr size_t length_in_code_units() const { return m_length_in_code_units & ~(1uz << Detail::UTF16_FLAG); } [[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const { + if (has_ascii_storage()) + return m_length_in_code_units; + if (m_length_in_code_points == NumericLimits::max()) m_length_in_code_points = calculate_length_in_code_points(); return m_length_in_code_points; @@ -201,6 +303,9 @@ public: constexpr Optional length_in_code_points_if_known() const { + if (has_ascii_storage()) + return m_length_in_code_units; + if (m_length_in_code_points == NumericLimits::max()) return {}; return m_length_in_code_points; @@ -211,7 +316,10 @@ public: [[nodiscard]] constexpr char16_t code_unit_at(size_t index) const { VERIFY(index < length_in_code_units()); - return m_string[index]; + + if (has_ascii_storage()) + return m_string.ascii[index]; + return m_string.utf16[index]; } [[nodiscard]] constexpr u32 code_point_at(size_t index) const @@ -236,18 +344,25 @@ public: [[nodiscard]] constexpr Utf16CodePointIterator begin() const { - return { m_string, length_in_code_units() }; + if (has_ascii_storage()) + return { m_string.ascii, length_in_code_units() }; + return { m_string.utf16, length_in_code_units() }; } [[nodiscard]] constexpr Utf16CodePointIterator end() const { - return { m_string + length_in_code_units(), 0 }; + if (has_ascii_storage()) + return { m_string.ascii + length_in_code_units(), 0 }; + return { m_string.utf16 + length_in_code_units(), 0 }; } [[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const { VERIFY(code_unit_offset + code_unit_length <= length_in_code_units()); - return { m_string + code_unit_offset, code_unit_length }; + + if (has_ascii_storage()) + return { m_string.ascii + code_unit_offset, code_unit_length }; + return { m_string.utf16 + 
code_unit_offset, code_unit_length }; } [[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset) const { return substring_view(code_unit_offset, length_in_code_units() - code_unit_offset); } @@ -259,12 +374,42 @@ public: { if (start_offset >= length_in_code_units()) return {}; - return AK::memmem_optional(m_string + start_offset, (length_in_code_units() - start_offset) * sizeof(char16_t), &needle, sizeof(needle)); + + if (has_ascii_storage()) { + if (!AK::is_ascii(needle)) + return {}; // A non-ASCII code unit cannot occur in ASCII storage, so report no match (not offset 0). + + auto byte = static_cast(needle); + return AK::memmem_optional(m_string.ascii + start_offset, length_in_code_units() - start_offset, &byte, sizeof(byte)); + } + + return AK::memmem_optional(m_string.utf16 + start_offset, (length_in_code_units() - start_offset) * sizeof(char16_t), &needle, sizeof(needle)); } constexpr Optional find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const { - return span().index_of(needle.span(), start_offset); + if (has_ascii_storage() && needle.has_ascii_storage()) + return ascii_span().index_of(needle.ascii_span(), start_offset); + if (!has_ascii_storage() && !needle.has_ascii_storage()) + return utf16_span().index_of(needle.utf16_span(), start_offset); + + Checked maximum_offset { start_offset }; + maximum_offset += needle.length_in_code_units(); + if (maximum_offset.has_overflow() || maximum_offset.value() > length_in_code_units()) + return {}; + + if (needle.is_empty()) + return start_offset; + + for (size_t index = start_offset; index <= length_in_code_units() - needle.length_in_code_units();) { + auto slice = substring_view(index, needle.length_in_code_units()); + if (slice == needle) + return index; + + index += slice.begin().length_in_code_units(); + } + + return {}; } constexpr Optional find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset = 0) const @@ -298,9 +443,24 @@ public: if (needle.length_in_code_units() > length_in_code_units()) return false; - if (m_string == 
needle.m_string) - return true; - return span().starts_with(needle.span()); + if (has_ascii_storage() && needle.has_ascii_storage()) { + if (m_string.ascii == needle.m_string.ascii) + return true; + return ascii_span().starts_with(needle.ascii_span()); + } + + if (!has_ascii_storage() && !needle.has_ascii_storage()) { + if (m_string.utf16 == needle.m_string.utf16) + return true; + return utf16_span().starts_with(needle.utf16_span()); + } + + for (auto this_it = begin(), needle_it = needle.begin(); needle_it != needle.end(); ++needle_it, ++this_it) { + if (*this_it != *needle_it) + return false; + } + + return true; } // https://infra.spec.whatwg.org/#code-unit-less-than @@ -320,9 +480,24 @@ public: } private: + friend Detail::Utf16StringBase; + friend Detail::Utf16StringData; + + constexpr Utf16View(char const* string, size_t length_in_code_units) + : m_string { .ascii = string } + , m_length_in_code_units(length_in_code_units) + { + } + [[nodiscard]] size_t calculate_length_in_code_points() const; - char16_t const* m_string { nullptr }; + union { + char const* ascii; + char16_t const* utf16; + } m_string { .ascii = nullptr }; + + // Just like Utf16StringData, we store whether this string has ASCII or UTF-16 storage by setting the most + // significant bit of m_length_in_code_units for UTF-16 storage. 
size_t m_length_in_code_units { 0 }; mutable size_t m_length_in_code_points { NumericLimits::max() }; }; @@ -342,6 +517,16 @@ struct Traits : public DefaultTraits { static unsigned hash(Utf16View const& s) { return s.hash(); } }; +namespace Detail { + +template<> +inline constexpr bool IsHashCompatible = true; + +template<> +inline constexpr bool IsHashCompatible = true; + +} + } [[nodiscard]] ALWAYS_INLINE AK_STRING_VIEW_LITERAL_CONSTEVAL AK::Utf16View operator""sv(char16_t const* string, size_t length) diff --git a/AK/Utf32View.h b/AK/Utf32View.h index b030d64b1fa..f0eff46309f 100644 --- a/AK/Utf32View.h +++ b/AK/Utf32View.h @@ -6,7 +6,9 @@ #pragma once +#include #include +#include #include #include #include @@ -99,6 +101,12 @@ public: bool is_null() const { return !m_code_points; } size_t length() const { return m_length; } + bool is_ascii() const + { + // FIXME: Petition simdutf to implement an ASCII validator for UTF-32. + return all_of(*this, AK::is_ascii); + } + size_t iterator_offset(Utf32CodePointIterator const& it) const { VERIFY(it.m_ptr >= m_code_points); diff --git a/Libraries/LibJS/Runtime/AbstractOperations.cpp b/Libraries/LibJS/Runtime/AbstractOperations.cpp index 5fea586ba6a..afa551fed07 100644 --- a/Libraries/LibJS/Runtime/AbstractOperations.cpp +++ b/Libraries/LibJS/Runtime/AbstractOperations.cpp @@ -1451,7 +1451,7 @@ ThrowCompletionOr get_substitution(VM& vm, Utf16View const& matched, Utf auto ref_length = ref.length_in_code_units(); // k. Set result to the string-concatenation of result and refReplacement. - result.append(ref_replacement.span().data(), ref_replacement.length_in_code_units()); + result.append(ref_replacement.utf16_span().data(), ref_replacement.length_in_code_units()); // j. Set templateRemainder to the substring of templateRemainder from refLength. // NOTE: We do this step last because refReplacement may point to templateRemainder. 
diff --git a/Libraries/LibJS/Runtime/Utf16String.cpp b/Libraries/LibJS/Runtime/Utf16String.cpp index 9acb9937fa0..637ebb151e6 100644 --- a/Libraries/LibJS/Runtime/Utf16String.cpp +++ b/Libraries/LibJS/Runtime/Utf16String.cpp @@ -44,7 +44,13 @@ NonnullRefPtr Utf16StringImpl::create(Utf16View const& view) { Utf16Data string; string.ensure_capacity(view.length_in_code_units()); - string.unchecked_append(view.span().data(), view.length_in_code_units()); + + if (view.has_ascii_storage()) { + for (size_t i = 0; i < view.length_in_code_units(); ++i) + string.unchecked_append(static_cast(view.code_unit_at(i))); + } else { + string.unchecked_append(view.utf16_span().data(), view.length_in_code_units()); + } auto impl = create(move(string)); if (auto length_in_code_points = view.length_in_code_points_if_known(); length_in_code_points.has_value()) diff --git a/Libraries/LibUnicode/Segmenter.cpp b/Libraries/LibUnicode/Segmenter.cpp index 25cfa5e59f5..113afc5c5a7 100644 --- a/Libraries/LibUnicode/Segmenter.cpp +++ b/Libraries/LibUnicode/Segmenter.cpp @@ -75,7 +75,12 @@ public: virtual void set_segmented_text(Utf16View const& text) override { - m_segmented_text = icu::UnicodeString { text.span().data(), static_cast(text.length_in_code_units()) }; + if (text.has_ascii_storage()) { + set_segmented_text(MUST(text.to_utf8())); + return; + } + + m_segmented_text = icu::UnicodeString { text.utf16_span().data(), static_cast(text.length_in_code_units()) }; m_segmenter->setText(m_segmented_text.get()); } diff --git a/Libraries/LibWeb/DOM/CharacterData.cpp b/Libraries/LibWeb/DOM/CharacterData.cpp index a2419de9a10..66d068f43f9 100644 --- a/Libraries/LibWeb/DOM/CharacterData.cpp +++ b/Libraries/LibWeb/DOM/CharacterData.cpp @@ -89,9 +89,9 @@ WebIDL::ExceptionOr CharacterData::replace_data(size_t offset, size_t coun Utf16Data full_data; full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units()); - 
full_data.append(before_data.span().data(), before_data.length_in_code_units()); + full_data.append(before_data.utf16_span().data(), before_data.length_in_code_units()); full_data.extend(inserted_data_result.data); - full_data.append(after_data.span().data(), after_data.length_in_code_units()); + full_data.append(after_data.utf16_span().data(), after_data.length_in_code_units()); Utf16View full_view { full_data }; bool characters_are_the_same = utf16_view == full_view; diff --git a/Tests/AK/CMakeLists.txt b/Tests/AK/CMakeLists.txt index ee87ac85819..b5690068dce 100644 --- a/Tests/AK/CMakeLists.txt +++ b/Tests/AK/CMakeLists.txt @@ -76,6 +76,7 @@ set(AK_TEST_SOURCES TestTypeTraits.cpp TestTypedTransfer.cpp TestUFixedBigInt.cpp + TestUtf16String.cpp TestUtf16View.cpp TestUtf8View.cpp TestVariant.cpp diff --git a/Tests/AK/TestUtf16String.cpp b/Tests/AK/TestUtf16String.cpp new file mode 100644 index 00000000000..a78c8ad6c4d --- /dev/null +++ b/Tests/AK/TestUtf16String.cpp @@ -0,0 +1,516 @@ +/* + * Copyright (c) 2025, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include + +#include +#include +#include +#include +#include +#include + +static Utf16String make_copy(Utf16String const& string) +{ + return string.has_ascii_storage() + ? 
Utf16String::from_utf8(string.ascii_view()) + : Utf16String::from_utf16(string.utf16_view()); +} + +TEST_CASE(empty_string) +{ + Utf16String string {}; + EXPECT(string.is_empty()); + EXPECT(string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 0uz); + EXPECT_EQ(string.length_in_code_points(), 0uz); + EXPECT_EQ(string.ascii_view(), StringView {}); +} + +TEST_CASE(from_utf8) +{ + { + auto string = Utf16String::from_utf8("hello!"sv); + EXPECT(!string.is_empty()); + EXPECT(string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 6uz); + EXPECT_EQ(string.length_in_code_points(), 6uz); + EXPECT_EQ(string.ascii_view(), "hello!"sv); + } + { + auto string = Utf16String::from_utf8("hello there!"sv); + EXPECT(!string.is_empty()); + EXPECT(string.is_ascii()); + EXPECT(string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 12uz); + EXPECT_EQ(string.length_in_code_points(), 12uz); + EXPECT_EQ(string.ascii_view(), "hello there!"sv); + } + { + auto string = Utf16String::from_utf8("😀"sv); + EXPECT(!string.is_empty()); + EXPECT(!string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 2uz); + EXPECT_EQ(string.length_in_code_points(), 1uz); + EXPECT_EQ(string.utf16_view(), u"😀"sv); + } + { + auto string = Utf16String::from_utf8("hello 😀 there!"sv); + EXPECT(!string.is_empty()); + EXPECT(!string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 15uz); + EXPECT_EQ(string.length_in_code_points(), 14uz); + EXPECT_EQ(string.utf16_view(), u"hello 😀 there!"sv); + } + { + auto string = Utf16String::from_utf8("hello \xed\xa0\x80!"sv); + 
EXPECT(!string.is_empty()); + EXPECT(!string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 8uz); + EXPECT_EQ(string.length_in_code_points(), 8uz); + EXPECT_EQ(string.utf16_view(), u"hello \xd800!"sv); + } + { + auto string = Utf16String::from_utf8("hello \xed\xb0\x80!"sv); + EXPECT(!string.is_empty()); + EXPECT(!string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 8uz); + EXPECT_EQ(string.length_in_code_points(), 8uz); + EXPECT_EQ(string.utf16_view(), u"hello \xdc00!"sv); + } +} + +TEST_CASE(from_utf16) +{ + { + auto string = Utf16String::from_utf16(u"hello!"sv); + EXPECT(!string.is_empty()); + EXPECT(string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 6uz); + EXPECT_EQ(string.length_in_code_points(), 6uz); + EXPECT_EQ(string.ascii_view(), "hello!"sv); + } + { + auto string = Utf16String::from_utf16(u"hello there!"sv); + EXPECT(!string.is_empty()); + EXPECT(string.is_ascii()); + EXPECT(string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 12uz); + EXPECT_EQ(string.length_in_code_points(), 12uz); + EXPECT_EQ(string.ascii_view(), "hello there!"sv); + } + { + auto string = Utf16String::from_utf16(u"😀"sv); + EXPECT(!string.is_empty()); + EXPECT(!string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 2uz); + EXPECT_EQ(string.length_in_code_points(), 1uz); + EXPECT_EQ(string.utf16_view(), u"😀"sv); + } + { + auto string = Utf16String::from_utf16(u"hello 😀 there!"sv); + EXPECT(!string.is_empty()); + EXPECT(!string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + 
EXPECT_EQ(string.length_in_code_units(), 15uz); + EXPECT_EQ(string.length_in_code_points(), 14uz); + EXPECT_EQ(string.utf16_view(), u"hello 😀 there!"sv); + } + { + auto string = Utf16String::from_utf16(u"hello \xd800!"sv); + EXPECT(!string.is_empty()); + EXPECT(!string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 8uz); + EXPECT_EQ(string.length_in_code_points(), 8uz); + EXPECT_EQ(string.utf16_view(), u"hello \xd800!"sv); + } + { + auto string = Utf16String::from_utf16(u"hello \xdc00!"sv); + EXPECT(!string.is_empty()); + EXPECT(!string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 8uz); + EXPECT_EQ(string.length_in_code_points(), 8uz); + EXPECT_EQ(string.utf16_view(), u"hello \xdc00!"sv); + } +} + +TEST_CASE(from_utf32) +{ + auto strlen32 = [](char32_t const* string) { + auto const* start = string; + while (*start) + ++start; + return static_cast(start - string); + }; + + auto to_utf32_view = [&](char32_t const* string) { + return Utf32View { reinterpret_cast(string), strlen32(string) }; + }; + + { + auto string = Utf16String::from_utf32(to_utf32_view(U"hello!")); + EXPECT(!string.is_empty()); + EXPECT(string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 6uz); + EXPECT_EQ(string.length_in_code_points(), 6uz); + EXPECT_EQ(string.ascii_view(), "hello!"sv); + } + { + auto string = Utf16String::from_utf32(to_utf32_view(U"hello there!")); + EXPECT(!string.is_empty()); + EXPECT(string.is_ascii()); + EXPECT(string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 12uz); + EXPECT_EQ(string.length_in_code_points(), 12uz); + EXPECT_EQ(string.ascii_view(), "hello there!"sv); + } + { + auto string = 
Utf16String::from_utf32(to_utf32_view(U"😀")); + EXPECT(!string.is_empty()); + EXPECT(!string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 2uz); + EXPECT_EQ(string.length_in_code_points(), 1uz); + EXPECT_EQ(string.utf16_view(), u"😀"sv); + } + { + auto string = Utf16String::from_utf32(to_utf32_view(U"hello 😀 there!")); + EXPECT(!string.is_empty()); + EXPECT(!string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 15uz); + EXPECT_EQ(string.length_in_code_points(), 14uz); + EXPECT_EQ(string.utf16_view(), u"hello 😀 there!"sv); + } + { + auto string = Utf16String::from_utf32(to_utf32_view(U"hello \xd800!")); + EXPECT(!string.is_empty()); + EXPECT(!string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 8uz); + EXPECT_EQ(string.length_in_code_points(), 8uz); + EXPECT_EQ(string.utf16_view(), u"hello \xd800!"sv); + } + { + auto string = Utf16String::from_utf32(to_utf32_view(U"hello \xdc00!")); + EXPECT(!string.is_empty()); + EXPECT(!string.is_ascii()); + EXPECT(!string.has_long_ascii_storage()); + EXPECT(!string.has_short_ascii_storage()); + EXPECT_EQ(string.length_in_code_units(), 8uz); + EXPECT_EQ(string.length_in_code_points(), 8uz); + EXPECT_EQ(string.utf16_view(), u"hello \xdc00!"sv); + } +} + +TEST_CASE(copy_operations) +{ + auto test = [](Utf16String const& string1) { + auto original = make_copy(string1); + + // Copy constructor. + Utf16String string2(string1); + + EXPECT_EQ(string1, original); + EXPECT_EQ(string1, string2); + + // Copy assignment. 
+ Utf16String string3; + string3 = string1; + + EXPECT_EQ(string1, original); + EXPECT_EQ(string1, string3); + }; + + test({}); + test("hello"_utf16); + test("hello there general!"_utf16); + test("hello 😀 there!"_utf16); +} + +TEST_CASE(move_operations) +{ + auto test = [](Utf16String string1) { + auto original = make_copy(string1); + + // Move constructor. + Utf16String string2(move(string1)); + + EXPECT(string1.is_empty()); + EXPECT_EQ(string1, Utf16String {}); + EXPECT_EQ(string2, original); + + // Move assignment. + Utf16String string3; + string3 = move(string2); + + EXPECT(string2.is_empty()); + EXPECT_EQ(string2, Utf16String {}); + EXPECT_EQ(string3, original); + }; + + test({}); + test("hello"_utf16); + test("hello there general!"_utf16); + test("hello 😀 there!"_utf16); +} + +TEST_CASE(equals) +{ + auto test = [](Utf16String const& string1, Utf16String const& inequal_string) { + auto string2 = make_copy(string1); + + EXPECT_EQ(string1, string1); + EXPECT_EQ(string1, string2); + EXPECT_EQ(string2, string1); + EXPECT_EQ(string2, string2); + + if (string1.has_long_utf16_storage()) { + EXPECT_EQ(string1, string1.utf16_view()); + EXPECT_EQ(string1, string2.utf16_view()); + EXPECT_EQ(string2, string1.utf16_view()); + EXPECT_EQ(string2, string2.utf16_view()); + + EXPECT_EQ(string1.utf16_view(), string1); + EXPECT_EQ(string1.utf16_view(), string2); + EXPECT_EQ(string2.utf16_view(), string1); + EXPECT_EQ(string2.utf16_view(), string2); + } + + EXPECT_NE(string1, inequal_string); + EXPECT_NE(string2, inequal_string); + EXPECT_NE(inequal_string, string1); + EXPECT_NE(inequal_string, string2); + + if (string1.has_long_utf16_storage()) { + EXPECT_NE(string1, inequal_string.utf16_view()); + EXPECT_NE(string2, inequal_string.utf16_view()); + EXPECT_NE(inequal_string, string1.utf16_view()); + EXPECT_NE(inequal_string, string2.utf16_view()); + + EXPECT_NE(string1.utf16_view(), inequal_string); + EXPECT_NE(string2.utf16_view(), inequal_string); + 
EXPECT_NE(inequal_string.utf16_view(), string1); + EXPECT_NE(inequal_string.utf16_view(), string2); + } + }; + + // Short (empty) ASCII string comparison. + test(Utf16String {}, "hello"_utf16); + + // Short ASCII string comparison. + test("hello"_utf16, "there"_utf16); + + // Short and long ASCII string comparison. + test("hello"_utf16, "hello there general!"_utf16); + + // Long ASCII string comparison. + test("hello there!"_utf16, "hello there general!"_utf16); + + // UTF-16 string comparison. + test("😀"_utf16, "hello 😀"_utf16); + + // Short ASCII and UTF-16 string comparison. + test("hello"_utf16, "😀"_utf16); + + // Short ASCII and UTF-16 string of same code unit length comparison. + test("ab"_utf16, "😀"_utf16); + + // Long ASCII and UTF-16 string comparison. + test("hello there general!"_utf16, "😀"_utf16); + + // Long ASCII and UTF-16 string of same code unit length comparison. + test("ababababab"_utf16, "😀😀😀😀😀"_utf16); +} + +TEST_CASE(equals_ascii) +{ + auto test = [](StringView ascii, Utf16String const& inequal_string) { + auto string = Utf16String::from_utf8(ascii); + + EXPECT_EQ(ascii, string); + EXPECT_EQ(string, ascii); + + EXPECT_NE(ascii, inequal_string); + EXPECT_NE(inequal_string, ascii); + }; + + // Short (empty) ASCII string comparison. + test({}, "hello"_utf16); + + // Short ASCII string comparison. + test("hello"sv, "there"_utf16); + + // Short and long ASCII string comparison. + test("hello"sv, "hello there general!"_utf16); + + // Long ASCII string comparison. + test("hello there!"sv, "hello there general!"_utf16); + + // Short ASCII and UTF-16 string comparison. + test("hello"sv, "😀"_utf16); + + // Short ASCII and UTF-16 string of same code unit length comparison. + test("ab"sv, "😀"_utf16); + + // Long ASCII and UTF-16 string comparison. + test("hello there general!"sv, "😀"_utf16); + + // Long ASCII and UTF-16 string of same code unit length comparison. + test("ababababab"sv, "😀😀😀😀😀"_utf16); + + // Non-ASCII string comparison. 
+ EXPECT_NE("😀"sv, "😀"_utf16); +} + +TEST_CASE(equals_ignoring_ascii_case) +{ + auto test = [](Utf16String const& string1, Utf16String const& inequal_string) { + StringBuilder builder; + for (auto [i, code_point] : enumerate(string1)) + builder.append_code_point(i % 2 == 0 ? to_ascii_uppercase(code_point) : code_point); + + auto string2 = Utf16String::from_utf8(builder.string_view()); + + EXPECT(string1.equals_ignoring_ascii_case(string1)); + EXPECT(string1.equals_ignoring_ascii_case(string2)); + EXPECT(string2.equals_ignoring_ascii_case(string1)); + EXPECT(string2.equals_ignoring_ascii_case(string2)); + + if (string1.has_long_utf16_storage()) { + EXPECT(string1.equals_ignoring_ascii_case(string1.utf16_view())); + EXPECT(string1.equals_ignoring_ascii_case(string2.utf16_view())); + EXPECT(string2.equals_ignoring_ascii_case(string1.utf16_view())); + EXPECT(string2.equals_ignoring_ascii_case(string2.utf16_view())); + } + + EXPECT(!string1.equals_ignoring_ascii_case(inequal_string)); + EXPECT(!string2.equals_ignoring_ascii_case(inequal_string)); + EXPECT(!inequal_string.equals_ignoring_ascii_case(string1)); + EXPECT(!inequal_string.equals_ignoring_ascii_case(string2)); + + if (string1.has_long_utf16_storage()) { + EXPECT(!string1.equals_ignoring_ascii_case(inequal_string.utf16_view())); + EXPECT(!string2.equals_ignoring_ascii_case(inequal_string.utf16_view())); + EXPECT(!inequal_string.equals_ignoring_ascii_case(string1.utf16_view())); + EXPECT(!inequal_string.equals_ignoring_ascii_case(string2.utf16_view())); + } + }; + + // Short (empty) ASCII string comparison. + test(Utf16String {}, "hello"_utf16); + + // Short ASCII string comparison. + test("hello"_utf16, "there"_utf16); + + // Short and long ASCII string comparison. + test("hello"_utf16, "hello there general!"_utf16); + + // Long ASCII string comparison. + test("hello there!"_utf16, "hello there general!"_utf16); + + // UTF-16 string comparison. 
+ test("😀"_utf16, "hello 😀"_utf16); + + // Short ASCII and UTF-16 string comparison. + test("hello"_utf16, "😀"_utf16); + + // Short ASCII and UTF-16 string of same code unit length comparison. + test("ab"_utf16, "😀"_utf16); + + // Long ASCII and UTF-16 string comparison. + test("hello there general!"_utf16, "😀"_utf16); + + // Long ASCII and UTF-16 string of same code unit length comparison. + test("ababababab"_utf16, "😀😀😀😀😀"_utf16); +} + +TEST_CASE(iteration) +{ + auto test = [](Utf16String const& string, ReadonlySpan code_points) { + EXPECT_EQ(string.length_in_code_points(), code_points.size()); + + for (auto [i, code_point] : enumerate(string)) { + if (code_points.size() == 0) + FAIL("Iterating an empty UTF-16 string should not produce any values"); + else + EXPECT_EQ(code_point, code_points[i]); + } + + auto iterator = string.end(); + EXPECT_DEATH("Dereferencing a UTF-16 iterator which is at its end", *iterator); + EXPECT_DEATH("Incrementing a UTF-16 iterator which is at its end", ++iterator); + }; + + test({}, {}); + test("hello"_utf16, { { 'h', 'e', 'l', 'l', 'o' } }); + test("hello there general!"_utf16, { { 'h', 'e', 'l', 'l', 'o', ' ', 't', 'h', 'e', 'r', 'e', ' ', 'g', 'e', 'n', 'e', 'r', 'a', 'l', '!' } }); + test("😀"_utf16, { { 0x1f600 } }); + test("hello 😀 there!"_utf16, { { 'h', 'e', 'l', 'l', 'o', ' ', 0x1f600, ' ', 't', 'h', 'e', 'r', 'e', '!' 
} }); +} + +TEST_CASE(code_unit_at) +{ + auto test = [](Utf16View const& view, size_t length_in_code_units) { + auto string = Utf16String::from_utf16(view); + EXPECT_EQ(string.length_in_code_units(), length_in_code_units); + + for (size_t i = 0; i < length_in_code_units; ++i) + EXPECT_EQ(string.code_unit_at(i), view.code_unit_at(i)); + }; + + test({}, 0); + test(u"hello"sv, 5); + test(u"hello there general!"sv, 20); + test(u"😀"sv, 2); + test(u"hello 😀 there!"sv, 15); +} + +TEST_CASE(code_point_at) +{ + auto test = [](Utf16View const& view, size_t length_in_code_points) { + auto string = Utf16String::from_utf16(view); + EXPECT_EQ(string.length_in_code_points(), length_in_code_points); + + for (size_t i = 0; i < string.length_in_code_units(); ++i) + EXPECT_EQ(string.code_point_at(i), view.code_point_at(i)); + }; + + test({}, 0); + test(u"hello"sv, 5); + test(u"hello there general!"sv, 20); + test(u"😀"sv, 1); + test(u"hello 😀 there!"sv, 14); +}