diff --git a/AK/CMakeLists.txt b/AK/CMakeLists.txt index d4cf43596cc..750ce4b0060 100644 --- a/AK/CMakeLists.txt +++ b/AK/CMakeLists.txt @@ -29,6 +29,7 @@ set(SOURCES StringUtils.cpp StringView.cpp Time.cpp + Utf16FlyString.cpp Utf16String.cpp Utf16StringData.cpp Utf16View.cpp diff --git a/AK/Forward.h b/AK/Forward.h index 4c602156d49..f588e70bd60 100644 --- a/AK/Forward.h +++ b/AK/Forward.h @@ -53,6 +53,7 @@ class String; class StringBuilder; class StringView; class UnixDateTime; +class Utf16FlyString; class Utf16String; class Utf16View; class Utf32CodePointIterator; @@ -200,6 +201,7 @@ using AK::StringView; using AK::TrailingCodePointTransformation; using AK::Traits; using AK::UnixDateTime; +using AK::Utf16FlyString; using AK::Utf16String; using AK::Utf16View; using AK::Utf32CodePointIterator; diff --git a/AK/Utf16FlyString.cpp b/AK/Utf16FlyString.cpp new file mode 100644 index 00000000000..31c616b4296 --- /dev/null +++ b/AK/Utf16FlyString.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2025, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include +#include +#include + +namespace AK { + +struct Utf16FlyStringTableHashTraits : public Traits { /* Table entries are pointers; they are hashed via the pointee's hash() and compared via the pointee's operator==. */ + static u32 hash(Detail::Utf16StringData const* string) { return string->hash(); } + static bool equals(Detail::Utf16StringData const* a, Detail::Utf16StringData const* b) { return *a == *b; } +}; + +static auto& all_utf16_fly_strings() /* Accessor for the function-local static table of fly string data pointers. */ +{ + static Singleton> table; + return *table; +} + +namespace Detail { + +void did_destroy_utf16_fly_string_data(Badge, Detail::Utf16StringData const& data) /* Removes `data` from the fly string table. */ +{ + all_utf16_fly_strings().remove(&data); +} + +} + +template +Optional Utf16FlyString::create_fly_string_from_cache(ViewType const& string) /* Looks up an existing fly string for `string`; never inserts into the table. Returns an empty Optional when `string` is empty or when no table entry matches. */ +{ + if (string.is_empty()) + return {}; + + if constexpr (IsSame) { /* In both branches, ASCII input no longer than MAX_SHORT_STRING_BYTE_COUNT is built directly and bypasses the table lookup. */ + if (string.length() <= Detail::MAX_SHORT_STRING_BYTE_COUNT && string.is_ascii()) + return Utf16String::from_utf8_without_validation(string); + } else { + if (string.length_in_code_units() <= 
Detail::MAX_SHORT_STRING_BYTE_COUNT && string.is_ascii()) + return Utf16String::from_utf16_without_validation(string); + } + + if (auto it = all_utf16_fly_strings().find(string.hash(), [&](auto const& entry) { return *entry == string; }); it != all_utf16_fly_strings().end()) /* Table hit: wrap the data that is already registered. */ + return Utf16FlyString { Detail::Utf16StringBase(**it) }; + + return {}; +} + +Utf16FlyString Utf16FlyString::from_utf8(StringView string) /* Like the other from_* factories: consult the cache first; on a miss, build a Utf16String and return it as a Utf16FlyString. */ +{ + if (auto result = create_fly_string_from_cache(string); result.has_value()) + return result.release_value(); + return Utf16String::from_utf8(string); +} + +Utf16FlyString Utf16FlyString::from_utf8_without_validation(StringView string) +{ + if (auto result = create_fly_string_from_cache(string); result.has_value()) + return result.release_value(); + return Utf16String::from_utf8_without_validation(string); +} + +Utf16FlyString Utf16FlyString::from_utf16(Utf16View const& string) +{ + if (auto result = create_fly_string_from_cache(string); result.has_value()) + return result.release_value(); + return Utf16String::from_utf16(string); +} + +Utf16FlyString Utf16FlyString::from_utf16_without_validation(Utf16View const& string) +{ + if (auto result = create_fly_string_from_cache(string); result.has_value()) + return result.release_value(); + return Utf16String::from_utf16_without_validation(string); +} + +Utf16FlyString::Utf16FlyString(Utf16String const& string) /* Adopts `string`, registering its data in the fly string table unless it is short ASCII or an equal entry already exists. */ +{ + if (string.has_short_ascii_storage()) { /* Short ASCII strings are adopted as-is and never enter the table. */ + m_data = string; + return; + } + + auto const* data = string.data({}); + + if (data->is_fly_string()) { /* This exact data is already flagged as a fly string: just share it. */ + m_data = string; + return; + } + + if (auto it = all_utf16_fly_strings().find(data); it == all_utf16_fly_strings().end()) { /* No equal entry yet: register this data and flag it as a fly string. */ + m_data = string; + + all_utf16_fly_strings().set(data); + data->mark_as_fly_string({}); + } else { /* An equal entry exists: point at the registered data instead of `string`'s own. */ + m_data.set_data({}, *it); + } +} + +size_t Utf16FlyString::number_of_utf16_fly_strings() +{ + return all_utf16_fly_strings().size(); +} + +} diff --git a/AK/Utf16FlyString.h b/AK/Utf16FlyString.h new file mode 100644 index 
00000000000..15a1c74d63c --- /dev/null +++ b/AK/Utf16FlyString.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2025, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#pragma once + +#include +#include +#include +#include + +namespace AK { + +class [[nodiscard]] Utf16FlyString { /* UTF-16 string wrapper around Detail::Utf16StringBase; two Utf16FlyStrings compare equal by raw storage value rather than by content. */ + AK_MAKE_DEFAULT_MOVABLE(Utf16FlyString); + AK_MAKE_DEFAULT_COPYABLE(Utf16FlyString); + +public: + constexpr Utf16FlyString() = default; + + static Utf16FlyString from_utf8(StringView); + static Utf16FlyString from_utf8_without_validation(StringView); + static Utf16FlyString from_utf8_but_should_be_ported_to_utf16(StringView string) { return from_utf8_without_validation(string); } /* Plain forwarder to from_utf8_without_validation(). */ + + static Utf16FlyString from_utf16(Utf16View const&); + static Utf16FlyString from_utf16_without_validation(Utf16View const&); + + template + requires(IsOneOf, Utf16String, Utf16FlyString>) + static Utf16FlyString from_utf16(T&&) = delete; + + Utf16FlyString(Utf16String const&); + + [[nodiscard]] ALWAYS_INLINE Utf16View view() const { return m_data.utf16_view(); } + + ALWAYS_INLINE explicit operator Utf16String() const { return to_utf16_string(); } + + ALWAYS_INLINE Utf16String to_utf16_string() const /* Returns a Utf16String built from a copy of the underlying string base. */ + { + Detail::Utf16StringBase copy { m_data }; + return Utf16String { move(copy) }; + } + + ALWAYS_INLINE Utf16FlyString& operator=(Utf16String const& string) /* Assignment goes through the Utf16String constructor. */ + { + *this = Utf16FlyString { string }; + return *this; + } + + [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16FlyString const& other) const { return m_data.raw({}) == other.m_data.raw({}); } /* Fly-to-fly equality compares only the raw storage values. */ + [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16String const& other) const { return m_data == other; } + [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16View const& other) const { return m_data == other; } + [[nodiscard]] ALWAYS_INLINE bool operator==(StringView other) const { return m_data == other; } + + [[nodiscard]] ALWAYS_INLINE bool equals_ignoring_ascii_case(Utf16FlyString const& other) const /* Tries the cheap raw-storage equality first, then falls back to the case-insensitive comparison. */ + { + if (*this == other) + return true; + return 
m_data.equals_ignoring_ascii_case(other.m_data); + } + + [[nodiscard]] ALWAYS_INLINE bool equals_ignoring_ascii_case(Utf16View const& other) const { return m_data.equals_ignoring_ascii_case(other); } + + template + [[nodiscard]] ALWAYS_INLINE bool is_one_of(Ts&&... strings) const + { + return (this->operator==(forward(strings)) || ...); + } + + template + [[nodiscard]] ALWAYS_INLINE bool is_one_of_ignoring_ascii_case(Ts&&... strings) const + { + return (this->equals_ignoring_ascii_case(forward(strings)) || ...); + } + + [[nodiscard]] ALWAYS_INLINE u32 hash() const { return m_data.hash(); } + [[nodiscard]] ALWAYS_INLINE bool is_empty() const { return m_data.is_empty(); } + [[nodiscard]] ALWAYS_INLINE bool is_ascii() const { return m_data.is_ascii(); } + + [[nodiscard]] ALWAYS_INLINE size_t length_in_code_units() const { return m_data.length_in_code_units(); } + [[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const { return m_data.length_in_code_points(); } + + [[nodiscard]] ALWAYS_INLINE char16_t code_unit_at(size_t code_unit_offset) const { return m_data.code_unit_at(code_unit_offset); } + [[nodiscard]] ALWAYS_INLINE u32 code_point_at(size_t code_unit_offset) const { return m_data.code_point_at(code_unit_offset); } + + [[nodiscard]] ALWAYS_INLINE size_t code_unit_offset_of(size_t code_point_offset) const { return m_data.code_unit_offset_of(code_point_offset); } + [[nodiscard]] ALWAYS_INLINE size_t code_point_offset_of(size_t code_unit_offset) const { return m_data.code_point_offset_of(code_unit_offset); } + + // This is primarily interesting to unit tests. 
+ [[nodiscard]] static size_t number_of_utf16_fly_strings(); + +private: + ALWAYS_INLINE explicit Utf16FlyString(Detail::Utf16StringBase data) + : m_data(move(data)) + { + } + + template + static Optional create_fly_string_from_cache(ViewType const&); + + Detail::Utf16StringBase m_data; +}; + +template<> +struct Traits : public DefaultTraits { + static unsigned hash(Utf16FlyString const& string) { return string.hash(); } +}; + +template<> +struct Formatter : Formatter { + ErrorOr format(FormatBuilder& builder, Utf16FlyString const& string) /* Formats by first converting to a Utf16String. */ + { + return Formatter::format(builder, string.to_utf16_string()); + } +}; + +} + +[[nodiscard]] ALWAYS_INLINE AK::Utf16FlyString operator""_utf16_fly_string(char const* string, size_t length) /* UTF-8 literal: validity is checked only by the ASSERT; the fly string itself is created without validation. */ +{ + AK::StringView view { string, length }; + + ASSERT(AK::Utf8View { view }.validate()); + return AK::Utf16FlyString::from_utf8_without_validation(view); +} + +[[nodiscard]] ALWAYS_INLINE AK::Utf16FlyString operator""_utf16_fly_string(char16_t const* string, size_t length) /* UTF-16 literal: validity is checked only by the ASSERT; the fly string itself is created without validation. */ +{ + AK::Utf16View view { string, length }; + + ASSERT(view.validate()); + return AK::Utf16FlyString::from_utf16_without_validation(view); +} diff --git a/AK/Utf16String.h b/AK/Utf16String.h index 1551e068360..040165d667e 100644 --- a/AK/Utf16String.h +++ b/AK/Utf16String.h @@ -70,15 +70,15 @@ public: static Utf16String from_utf32(Utf32View const&); template - requires(IsOneOf, Utf16String>) + requires(IsOneOf, Utf16String, Utf16FlyString>) static Utf16String from_utf16(T&&) = delete; template - requires(IsOneOf, Utf16String>) + requires(IsOneOf, Utf16String, Utf16FlyString>) static ErrorOr try_from_utf16(T&&) = delete; template - requires(IsOneOf, Utf16String>) + requires(IsOneOf, Utf16String, Utf16FlyString>) static Utf16String from_utf16_without_validation(T&&) = delete; template diff --git a/AK/Utf16StringBase.h b/AK/Utf16StringBase.h index 31b20fde526..6f33788b3e9 100644 --- a/AK/Utf16StringBase.h +++ b/AK/Utf16StringBase.h @@ -248,6 +248,21 @@ public: return 
!has_short_ascii_storage(); } + + [[nodiscard]] ALWAYS_INLINE Utf16StringData const* data(Badge) const /* Returns the long-storage data pointer; VERIFYs that the string actually has long storage. */ + { + VERIFY(has_long_storage()); + return data_without_union_member_assertion(); + } + + ALWAYS_INLINE void set_data(Badge, Utf16StringData const* data) /* Stores `data` and takes a reference on it. NOTE(review): nothing here releases a previously stored pointer - presumably callers only use this on a string that holds no long-storage data; confirm. */ + { + auto const** this_data = __builtin_launder(&m_value.data); + (*this_data) = data; + (*this_data)->ref(); + } + + [[nodiscard]] constexpr FlatPtr raw(Badge) const { return bit_cast(m_value); } /* Exposes the stored value bit-cast to an integer. */ + protected: ALWAYS_INLINE void destroy_string() const { diff --git a/AK/Utf16StringData.h b/AK/Utf16StringData.h index 30aa262063d..18a2458c37e 100644 --- a/AK/Utf16StringData.h +++ b/AK/Utf16StringData.h @@ -16,6 +16,8 @@ namespace AK::Detail { +void did_destroy_utf16_fly_string_data(Badge, Detail::Utf16StringData const&); + class Utf16StringData final : public RefCounted { public: enum class StorageType : u8 { @@ -33,7 +35,11 @@ public: static NonnullRefPtr from_utf32(Utf32View const&); static NonnullRefPtr from_string_builder(StringBuilder&); - ~Utf16StringData() = default; + ~Utf16StringData() /* Data flagged as a fly string reports its own destruction via did_destroy_utf16_fly_string_data(). */ + { + if (is_fly_string()) + did_destroy_utf16_fly_string_data({}, *this); + } [[nodiscard]] static constexpr size_t offset_of_string_storage() { @@ -47,6 +53,8 @@ public: [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16StringData const& other) const { + if (is_fly_string() && other.is_fly_string()) /* Two fly strings are equal only when they are the same object. */ + return this == &other; return utf16_view() == other.utf16_view(); } @@ -99,6 +107,9 @@ public: return view; } + ALWAYS_INLINE void mark_as_fly_string(Badge) const { m_is_fly_string = true; } /* Const-callable because the flag is a mutable member. */ + [[nodiscard]] ALWAYS_INLINE bool is_fly_string() const { return m_is_fly_string; } + private: ALWAYS_INLINE Utf16StringData(StorageType storage_type, size_t code_unit_length) : m_length_in_code_units(code_unit_length) @@ -130,6 +141,8 @@ private: mutable u32 m_hash { 0 }; mutable bool m_has_hash { false }; + mutable bool m_is_fly_string { false }; + union { char m_ascii_data[0]; char16_t m_utf16_data[0]; diff --git 
a/Tests/AK/CMakeLists.txt b/Tests/AK/CMakeLists.txt index b5690068dce..dc50ac217f8 100644 --- a/Tests/AK/CMakeLists.txt +++ b/Tests/AK/CMakeLists.txt @@ -76,6 +76,7 @@ set(AK_TEST_SOURCES TestTypeTraits.cpp TestTypedTransfer.cpp TestUFixedBigInt.cpp + TestUtf16FlyString.cpp TestUtf16String.cpp TestUtf16View.cpp TestUtf8View.cpp diff --git a/Tests/AK/TestUtf16FlyString.cpp b/Tests/AK/TestUtf16FlyString.cpp new file mode 100644 index 00000000000..44fd4c37e38 --- /dev/null +++ b/Tests/AK/TestUtf16FlyString.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2025, Tim Flynn + * + * SPDX-License-Identifier: BSD-2-Clause + */ + +#include + +#include + +TEST_CASE(empty_string) +{ + Utf16FlyString fly {}; + EXPECT(fly.is_empty()); + EXPECT_EQ(fly, ""sv); + + // Short strings do not get stored in the fly string table. + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 0u); +} + +TEST_CASE(short_string) +{ + Utf16FlyString fly1 { "foo"_utf16 }; + EXPECT_EQ(fly1, "foo"sv); + + Utf16FlyString fly2 { "foo"_utf16 }; + EXPECT_EQ(fly2, "foo"sv); + + Utf16FlyString fly3 { "bar"_utf16 }; + EXPECT_EQ(fly3, "bar"sv); + + EXPECT_EQ(fly1, fly2); + EXPECT_NE(fly1, fly3); + EXPECT_NE(fly2, fly3); + + EXPECT(fly1.to_utf16_string().has_short_ascii_storage()); + EXPECT(fly2.to_utf16_string().has_short_ascii_storage()); + EXPECT(fly3.to_utf16_string().has_short_ascii_storage()); + + // Short strings do not get stored in the fly string table. 
+ EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 0u); +} + +TEST_CASE(long_string) /* Equal long strings must share one table entry; a different long string adds a second entry. */ +{ + Utf16FlyString fly1 { "thisisdefinitelymorethan7bytes"_utf16 }; + EXPECT_EQ(fly1, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u); + + Utf16FlyString fly2 { "thisisdefinitelymorethan7bytes"_utf16 }; + EXPECT_EQ(fly2, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u); + + Utf16FlyString fly3 { "thisisalsoforsuremorethan7bytes"_utf16 }; + EXPECT_EQ(fly3, "thisisalsoforsuremorethan7bytes"sv); + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 2u); + + EXPECT_EQ(fly1, fly2); + EXPECT_NE(fly1, fly3); + EXPECT_NE(fly2, fly3); + + EXPECT(fly1.to_utf16_string().has_long_ascii_storage()); + EXPECT(fly2.to_utf16_string().has_long_ascii_storage()); + EXPECT(fly3.to_utf16_string().has_long_ascii_storage()); +} + +TEST_CASE(user_defined_literal) /* UTF-8 and UTF-16 literals with the same contents must resolve to a single table entry; the short literal must not add one. */ +{ + auto fly1 = "thisisdefinitelymorethan7bytes"_utf16_fly_string; + EXPECT_EQ(fly1, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u); + + auto fly2 = "thisisdefinitelymorethan7bytes"_utf16_fly_string; + EXPECT_EQ(fly2, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u); + + auto fly3 = u"thisisdefinitelymorethan7bytes"_utf16_fly_string; + EXPECT_EQ(fly3, u"thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u); + + auto fly4 = "foo"_utf16_fly_string; + EXPECT_EQ(fly4, "foo"sv); + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u); + + EXPECT_EQ(fly1, fly2); + EXPECT_EQ(fly1, fly3); + EXPECT_EQ(fly2, fly3); /* Every pair of equal fly strings is compared, including the UTF-8 literal against the UTF-16 literal. */ + + EXPECT_NE(fly1, fly4); + EXPECT_NE(fly2, fly4); + EXPECT_NE(fly3, fly4); +} + +TEST_CASE(fly_string_keep_string_data_alive) /* The table entry must outlive the source Utf16String and disappear once the fly string itself is gone. */ +{ + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 0u); + { + Utf16FlyString fly {}; + { + auto string = 
"thisisdefinitelymorethan7bytes"_utf16; + fly = Utf16FlyString { string }; + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u); + } + + EXPECT_EQ(fly, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u); + } + + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 0u); +} + +TEST_CASE(moved_fly_string_becomes_empty) /* Moving transfers the data without changing the number of table entries and leaves the source empty. */ +{ + Utf16FlyString fly1 {}; + EXPECT(fly1.is_empty()); + + Utf16FlyString fly2 { "thisisdefinitelymorethan7bytes"_utf16 }; + EXPECT_EQ(fly2, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u); + + fly1 = move(fly2); + + EXPECT(fly2.is_empty()); + EXPECT_EQ(fly1, "thisisdefinitelymorethan7bytes"sv); + EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u); +} + +TEST_CASE(is_one_of) /* Exercises is_one_of() with both fly string and StringView arguments, in either order. */ +{ + auto foo = Utf16FlyString::from_utf8("foo"sv); + auto bar = Utf16FlyString::from_utf16(u"bar"sv); + + EXPECT(foo.is_one_of(foo)); + EXPECT(foo.is_one_of(foo, bar)); + EXPECT(foo.is_one_of(bar, foo)); + EXPECT(!foo.is_one_of(bar)); + + EXPECT(!bar.is_one_of("foo"sv)); + EXPECT(bar.is_one_of("foo"sv, "bar"sv)); + EXPECT(bar.is_one_of("bar"sv, "foo"sv)); + EXPECT(bar.is_one_of("bar"sv)); +}