AK: Implement a flyweight string for Utf16String

Utf16FlyString works in much the same way as FlyString. It stores the
raw encoded data of the string instance: if the string is a short ASCII
string, Utf16FlyString holds the ShortString bytes; otherwise, it holds
a pointer to the Utf16StringData.
This commit is contained in:
Timothy Flynn 2025-06-20 11:52:35 -04:00 committed by Tim Flynn
commit 7f069efbc4
Notes: github-actions[bot] 2025-07-18 16:47:19 +00:00
9 changed files with 423 additions and 4 deletions

View file

@ -29,6 +29,7 @@ set(SOURCES
StringUtils.cpp StringUtils.cpp
StringView.cpp StringView.cpp
Time.cpp Time.cpp
Utf16FlyString.cpp
Utf16String.cpp Utf16String.cpp
Utf16StringData.cpp Utf16StringData.cpp
Utf16View.cpp Utf16View.cpp

View file

@ -53,6 +53,7 @@ class String;
class StringBuilder; class StringBuilder;
class StringView; class StringView;
class UnixDateTime; class UnixDateTime;
class Utf16FlyString;
class Utf16String; class Utf16String;
class Utf16View; class Utf16View;
class Utf32CodePointIterator; class Utf32CodePointIterator;
@ -200,6 +201,7 @@ using AK::StringView;
using AK::TrailingCodePointTransformation; using AK::TrailingCodePointTransformation;
using AK::Traits; using AK::Traits;
using AK::UnixDateTime; using AK::UnixDateTime;
using AK::Utf16FlyString;
using AK::Utf16String; using AK::Utf16String;
using AK::Utf16View; using AK::Utf16View;
using AK::Utf32CodePointIterator; using AK::Utf32CodePointIterator;

110
AK/Utf16FlyString.cpp Normal file
View file

@ -0,0 +1,110 @@
/*
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/HashTable.h>
#include <AK/Singleton.h>
#include <AK/Utf16FlyString.h>
namespace AK {
// Hash traits for the fly string table. The table stores raw Utf16StringData pointers, but
// entries are hashed and compared through the pointed-to string data, not the pointer value.
struct Utf16FlyStringTableHashTraits : public Traits<Detail::Utf16StringData const*> {
    static bool equals(Detail::Utf16StringData const* lhs, Detail::Utf16StringData const* rhs)
    {
        return *lhs == *rhs;
    }

    static u32 hash(Detail::Utf16StringData const* data)
    {
        return data->hash();
    }
};
static auto& all_utf16_fly_strings()
{
static Singleton<HashTable<Detail::Utf16StringData const*, Utf16FlyStringTableHashTraits>> table;
return *table;
}
namespace Detail {

// Drops `data` from the fly string table. Only Utf16StringData can call this (Badge-guarded).
void did_destroy_utf16_fly_string_data(Badge<Detail::Utf16StringData>, Detail::Utf16StringData const& data)
{
    auto& fly_strings = all_utf16_fly_strings();
    fly_strings.remove(&data);
}

}
// Tries to produce a fly string for `string` without allocating new long string data.
//
// Returns:
//  - an empty Optional for an empty view, or when no matching entry exists in the table;
//  - a fly string built from a short ASCII Utf16String when the view is ASCII and no longer
//    than Detail::MAX_SHORT_STRING_BYTE_COUNT;
//  - a fly string sharing the already-registered string data when the table has a match.
template<typename ViewType>
Optional<Utf16FlyString> Utf16FlyString::create_fly_string_from_cache(ViewType const& string)
{
    if (string.is_empty())
        return {};

    if constexpr (IsSame<ViewType, StringView>) {
        bool const fits_short_storage = string.length() <= Detail::MAX_SHORT_STRING_BYTE_COUNT;
        if (fits_short_storage && string.is_ascii())
            return Utf16String::from_utf8_without_validation(string);
    } else {
        bool const fits_short_storage = string.length_in_code_units() <= Detail::MAX_SHORT_STRING_BYTE_COUNT;
        if (fits_short_storage && string.is_ascii())
            return Utf16String::from_utf16_without_validation(string);
    }

    // Look the view up by hash, comparing each candidate entry's contents against the view.
    auto& fly_strings = all_utf16_fly_strings();
    auto match = fly_strings.find(string.hash(), [&](auto const& entry) { return *entry == string; });

    if (match == fly_strings.end())
        return {};

    return Utf16FlyString { Detail::Utf16StringBase(**match) };
}
// Creates a fly string from UTF-8 text. A cached or short-string result is used when available;
// otherwise a new Utf16String is built via Utf16String::from_utf8 and converted to a fly string.
Utf16FlyString Utf16FlyString::from_utf8(StringView string)
{
    auto cached = create_fly_string_from_cache(string);
    if (cached.has_value())
        return cached.release_value();

    return Utf16String::from_utf8(string);
}
// Same as from_utf8(), except the fallback path uses Utf16String::from_utf8_without_validation.
Utf16FlyString Utf16FlyString::from_utf8_without_validation(StringView string)
{
    auto cached = create_fly_string_from_cache(string);
    if (cached.has_value())
        return cached.release_value();

    return Utf16String::from_utf8_without_validation(string);
}
// Creates a fly string from a UTF-16 view. A cached or short-string result is used when available;
// otherwise a new Utf16String is built via Utf16String::from_utf16 and converted to a fly string.
Utf16FlyString Utf16FlyString::from_utf16(Utf16View const& string)
{
    auto cached = create_fly_string_from_cache(string);
    if (cached.has_value())
        return cached.release_value();

    return Utf16String::from_utf16(string);
}
// Same as from_utf16(), except the fallback path uses Utf16String::from_utf16_without_validation.
Utf16FlyString Utf16FlyString::from_utf16_without_validation(Utf16View const& string)
{
    auto cached = create_fly_string_from_cache(string);
    if (cached.has_value())
        return cached.release_value();

    return Utf16String::from_utf16_without_validation(string);
}
// Converts a Utf16String into a fly string.
//
// Short ASCII strings are copied as-is and never touch the fly string table. Long strings
// whose data is already marked as a fly string are shared directly. Any other long string is
// looked up in the table: on a hit this instance points at the registered data, on a miss the
// given string's data is registered and marked as a fly string.
Utf16FlyString::Utf16FlyString(Utf16String const& string)
{
    if (string.has_short_ascii_storage()) {
        m_data = string;
        return;
    }

    auto const* string_data = string.data({});

    if (string_data->is_fly_string()) {
        m_data = string;
        return;
    }

    auto& fly_strings = all_utf16_fly_strings();
    auto existing = fly_strings.find(string_data);

    if (existing != fly_strings.end()) {
        // Equal data is already registered; share it instead of the caller's copy.
        m_data.set_data({}, *existing);
        return;
    }

    // First occurrence: keep the caller's data and register it in the table.
    m_data = string;
    fly_strings.set(string_data);
    string_data->mark_as_fly_string({});
}
// Reports how many entries the fly string table currently holds.
size_t Utf16FlyString::number_of_utf16_fly_strings()
{
    auto const& fly_strings = all_utf16_fly_strings();
    return fly_strings.size();
}
}

135
AK/Utf16FlyString.h Normal file
View file

@ -0,0 +1,135 @@
/*
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Format.h>
#include <AK/Optional.h>
#include <AK/Traits.h>
#include <AK/Utf16String.h>
namespace AK {
// A flyweight (interned) UTF-16 string.
//
// The instance wraps a Detail::Utf16StringBase. Short ASCII strings are held as-is; long strings
// are registered in a table (see Utf16FlyString.cpp) so that equal long strings share one
// Utf16StringData. This lets two fly strings be compared via their raw representation.
class [[nodiscard]] Utf16FlyString {
AK_MAKE_DEFAULT_MOVABLE(Utf16FlyString);
AK_MAKE_DEFAULT_COPYABLE(Utf16FlyString);
public:
constexpr Utf16FlyString() = default;
// Factories. Each one first tries the fly string cache and otherwise builds a Utf16String
// with the Utf16String factory of the same name.
static Utf16FlyString from_utf8(StringView);
static Utf16FlyString from_utf8_without_validation(StringView);
// Forwards to from_utf8_without_validation().
// NOTE(review): judging by the name, this marks call sites still to be ported to UTF-16 - confirm.
static Utf16FlyString from_utf8_but_should_be_ported_to_utf16(StringView string) { return from_utf8_without_validation(string); }
static Utf16FlyString from_utf16(Utf16View const&);
static Utf16FlyString from_utf16_without_validation(Utf16View const&);
// Calling from_utf16() with a Utf16String or Utf16FlyString is rejected at compile time.
template<typename T>
requires(IsOneOf<RemoveCVReference<T>, Utf16String, Utf16FlyString>)
static Utf16FlyString from_utf16(T&&) = delete;
// Implicit conversion from Utf16String; long strings are registered in / looked up from the table.
Utf16FlyString(Utf16String const&);
[[nodiscard]] ALWAYS_INLINE Utf16View view() const { return m_data.utf16_view(); }
ALWAYS_INLINE explicit operator Utf16String() const { return to_utf16_string(); }
// Builds a Utf16String from a copy of the underlying string base.
ALWAYS_INLINE Utf16String to_utf16_string() const
{
Detail::Utf16StringBase copy { m_data };
return Utf16String { move(copy) };
}
// Assigning a Utf16String goes through the converting constructor.
ALWAYS_INLINE Utf16FlyString& operator=(Utf16String const& string)
{
*this = Utf16FlyString { string };
return *this;
}
// Fly-string-to-fly-string equality compares the raw stored value rather than string contents.
[[nodiscard]] ALWAYS_INLINE bool operator==(Utf16FlyString const& other) const { return m_data.raw({}) == other.m_data.raw({}); }
// Comparisons against other string types are delegated to the underlying string base.
[[nodiscard]] ALWAYS_INLINE bool operator==(Utf16String const& other) const { return m_data == other; }
[[nodiscard]] ALWAYS_INLINE bool operator==(Utf16View const& other) const { return m_data == other; }
[[nodiscard]] ALWAYS_INLINE bool operator==(StringView other) const { return m_data == other; }
[[nodiscard]] ALWAYS_INLINE bool equals_ignoring_ascii_case(Utf16FlyString const& other) const
{
// Identical fly strings are trivially equal; skip the case-insensitive comparison.
if (*this == other)
return true;
return m_data.equals_ignoring_ascii_case(other.m_data);
}
[[nodiscard]] ALWAYS_INLINE bool equals_ignoring_ascii_case(Utf16View const& other) const { return m_data.equals_ignoring_ascii_case(other); }
// Returns true if this string compares equal (operator==) to any of the given strings.
template<typename... Ts>
[[nodiscard]] ALWAYS_INLINE bool is_one_of(Ts&&... strings) const
{
return (this->operator==(forward<Ts>(strings)) || ...);
}
// Returns true if this string equals any of the given strings, ignoring ASCII case.
template<typename... Ts>
[[nodiscard]] ALWAYS_INLINE bool is_one_of_ignoring_ascii_case(Ts&&... strings) const
{
return (this->equals_ignoring_ascii_case(forward<Ts>(strings)) || ...);
}
// The accessors below forward directly to the underlying string base.
[[nodiscard]] ALWAYS_INLINE u32 hash() const { return m_data.hash(); }
[[nodiscard]] ALWAYS_INLINE bool is_empty() const { return m_data.is_empty(); }
[[nodiscard]] ALWAYS_INLINE bool is_ascii() const { return m_data.is_ascii(); }
[[nodiscard]] ALWAYS_INLINE size_t length_in_code_units() const { return m_data.length_in_code_units(); }
[[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const { return m_data.length_in_code_points(); }
[[nodiscard]] ALWAYS_INLINE char16_t code_unit_at(size_t code_unit_offset) const { return m_data.code_unit_at(code_unit_offset); }
[[nodiscard]] ALWAYS_INLINE u32 code_point_at(size_t code_unit_offset) const { return m_data.code_point_at(code_unit_offset); }
[[nodiscard]] ALWAYS_INLINE size_t code_unit_offset_of(size_t code_point_offset) const { return m_data.code_unit_offset_of(code_point_offset); }
[[nodiscard]] ALWAYS_INLINE size_t code_point_offset_of(size_t code_unit_offset) const { return m_data.code_point_offset_of(code_unit_offset); }
// This is primarily interesting to unit tests.
[[nodiscard]] static size_t number_of_utf16_fly_strings();
private:
// Adopts an existing string base directly, without consulting the fly string table.
ALWAYS_INLINE explicit Utf16FlyString(Detail::Utf16StringBase data)
: m_data(move(data))
{
}
// Shared lookup used by the from_* factories; instantiated for StringView and Utf16View.
template<typename ViewType>
static Optional<Utf16FlyString> create_fly_string_from_cache(ViewType const&);
Detail::Utf16StringBase m_data;
};
// Lets Utf16FlyString be used with Traits-based containers by exposing the string's own hash.
template<>
struct Traits<Utf16FlyString> : public DefaultTraits<Utf16FlyString> {
    static unsigned hash(Utf16FlyString const& fly_string)
    {
        return fly_string.hash();
    }
};
// Formats a Utf16FlyString by converting it to a Utf16String and reusing that type's formatter.
template<>
struct Formatter<Utf16FlyString> : Formatter<Utf16String> {
    ErrorOr<void> format(FormatBuilder& builder, Utf16FlyString const& fly_string)
    {
        using Base = Formatter<Utf16String>;
        return Base::format(builder, fly_string.to_utf16_string());
    }
};
}
// Literal operator for UTF-8 literals: "text"_utf16_fly_string.
// The literal is only checked for valid UTF-8 via ASSERT; creation itself skips validation.
[[nodiscard]] ALWAYS_INLINE AK::Utf16FlyString operator""_utf16_fly_string(char const* string, size_t length)
{
    auto utf8_literal = AK::StringView { string, length };
    ASSERT(AK::Utf8View { utf8_literal }.validate());

    return AK::Utf16FlyString::from_utf8_without_validation(utf8_literal);
}
// Literal operator for UTF-16 literals: u"text"_utf16_fly_string.
// The literal is only checked for valid UTF-16 via ASSERT; creation itself skips validation.
[[nodiscard]] ALWAYS_INLINE AK::Utf16FlyString operator""_utf16_fly_string(char16_t const* string, size_t length)
{
    auto utf16_literal = AK::Utf16View { string, length };
    ASSERT(utf16_literal.validate());

    return AK::Utf16FlyString::from_utf16_without_validation(utf16_literal);
}

View file

@ -70,15 +70,15 @@ public:
static Utf16String from_utf32(Utf32View const&); static Utf16String from_utf32(Utf32View const&);
template<typename T> template<typename T>
requires(IsOneOf<RemoveCVReference<T>, Utf16String>) requires(IsOneOf<RemoveCVReference<T>, Utf16String, Utf16FlyString>)
static Utf16String from_utf16(T&&) = delete; static Utf16String from_utf16(T&&) = delete;
template<typename T> template<typename T>
requires(IsOneOf<RemoveCVReference<T>, Utf16String>) requires(IsOneOf<RemoveCVReference<T>, Utf16String, Utf16FlyString>)
static ErrorOr<Utf16String> try_from_utf16(T&&) = delete; static ErrorOr<Utf16String> try_from_utf16(T&&) = delete;
template<typename T> template<typename T>
requires(IsOneOf<RemoveCVReference<T>, Utf16String>) requires(IsOneOf<RemoveCVReference<T>, Utf16String, Utf16FlyString>)
static Utf16String from_utf16_without_validation(T&&) = delete; static Utf16String from_utf16_without_validation(T&&) = delete;
template<typename... Parameters> template<typename... Parameters>

View file

@ -248,6 +248,21 @@ public:
return !has_short_ascii_storage(); return !has_short_ascii_storage();
} }
// Returns the Utf16StringData pointer held by this string. Restricted to Utf16FlyString via Badge.
// The string must use long storage; this is enforced with VERIFY.
[[nodiscard]] ALWAYS_INLINE Utf16StringData const* data(Badge<Utf16FlyString>) const
{
VERIFY(has_long_storage());
return data_without_union_member_assertion();
}
// Makes this string point at `data` and takes a reference on it. Restricted to Utf16FlyString via Badge.
// The previously stored value is overwritten without being released here.
// NOTE(review): presumably callers only use this on a string that does not own long data - confirm.
ALWAYS_INLINE void set_data(Badge<Utf16FlyString>, Utf16StringData const* data)
{
// The data member of m_value is accessed through __builtin_launder before being overwritten.
auto const** this_data = __builtin_launder(&m_value.data);
(*this_data) = data;
(*this_data)->ref();
}
// Returns the stored value reinterpreted (bit_cast) as a FlatPtr. Restricted to Utf16FlyString via
// Badge, which uses it to compare two fly strings without inspecting their contents.
[[nodiscard]] constexpr FlatPtr raw(Badge<Utf16FlyString>) const { return bit_cast<FlatPtr>(m_value); }
protected: protected:
ALWAYS_INLINE void destroy_string() const ALWAYS_INLINE void destroy_string() const
{ {

View file

@ -16,6 +16,8 @@
namespace AK::Detail { namespace AK::Detail {
void did_destroy_utf16_fly_string_data(Badge<Detail::Utf16StringData>, Detail::Utf16StringData const&);
class Utf16StringData final : public RefCounted<Utf16StringData> { class Utf16StringData final : public RefCounted<Utf16StringData> {
public: public:
enum class StorageType : u8 { enum class StorageType : u8 {
@ -33,7 +35,11 @@ public:
static NonnullRefPtr<Utf16StringData> from_utf32(Utf32View const&); static NonnullRefPtr<Utf16StringData> from_utf32(Utf32View const&);
static NonnullRefPtr<Utf16StringData> from_string_builder(StringBuilder&); static NonnullRefPtr<Utf16StringData> from_string_builder(StringBuilder&);
~Utf16StringData() = default; ~Utf16StringData()
{
if (is_fly_string())
did_destroy_utf16_fly_string_data({}, *this);
}
[[nodiscard]] static constexpr size_t offset_of_string_storage() [[nodiscard]] static constexpr size_t offset_of_string_storage()
{ {
@ -47,6 +53,8 @@ public:
[[nodiscard]] ALWAYS_INLINE bool operator==(Utf16StringData const& other) const [[nodiscard]] ALWAYS_INLINE bool operator==(Utf16StringData const& other) const
{ {
if (is_fly_string() && other.is_fly_string())
return this == &other;
return utf16_view() == other.utf16_view(); return utf16_view() == other.utf16_view();
} }
@ -99,6 +107,9 @@ public:
return view; return view;
} }
// Flags this string data as a fly string. Restricted to Utf16FlyString via Badge.
// Callable on const data because the flag is a mutable member.
ALWAYS_INLINE void mark_as_fly_string(Badge<Utf16FlyString>) const { m_is_fly_string = true; }
// Returns whether this string data has been marked as a fly string.
[[nodiscard]] ALWAYS_INLINE bool is_fly_string() const { return m_is_fly_string; }
private: private:
ALWAYS_INLINE Utf16StringData(StorageType storage_type, size_t code_unit_length) ALWAYS_INLINE Utf16StringData(StorageType storage_type, size_t code_unit_length)
: m_length_in_code_units(code_unit_length) : m_length_in_code_units(code_unit_length)
@ -130,6 +141,8 @@ private:
mutable u32 m_hash { 0 }; mutable u32 m_hash { 0 };
mutable bool m_has_hash { false }; mutable bool m_has_hash { false };
mutable bool m_is_fly_string { false };
union { union {
char m_ascii_data[0]; char m_ascii_data[0];
char16_t m_utf16_data[0]; char16_t m_utf16_data[0];

View file

@ -76,6 +76,7 @@ set(AK_TEST_SOURCES
TestTypeTraits.cpp TestTypeTraits.cpp
TestTypedTransfer.cpp TestTypedTransfer.cpp
TestUFixedBigInt.cpp TestUFixedBigInt.cpp
TestUtf16FlyString.cpp
TestUtf16String.cpp TestUtf16String.cpp
TestUtf16View.cpp TestUtf16View.cpp
TestUtf8View.cpp TestUtf8View.cpp

View file

@ -0,0 +1,142 @@
/*
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibTest/TestCase.h>
#include <AK/Utf16FlyString.h>
// A default-constructed fly string is empty and is not stored in the fly string table.
TEST_CASE(empty_string)
{
    Utf16FlyString empty {};
    EXPECT(empty.is_empty());
    EXPECT_EQ(empty, ""sv);

    // Short strings do not get stored in the fly string table.
    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 0u);
}
// Short ASCII fly strings compare by value, keep short storage, and bypass the table.
TEST_CASE(short_string)
{
    Utf16FlyString first_foo { "foo"_utf16 };
    EXPECT_EQ(first_foo, "foo"sv);

    Utf16FlyString second_foo { "foo"_utf16 };
    EXPECT_EQ(second_foo, "foo"sv);

    Utf16FlyString bar { "bar"_utf16 };
    EXPECT_EQ(bar, "bar"sv);

    EXPECT_EQ(first_foo, second_foo);
    EXPECT_NE(first_foo, bar);
    EXPECT_NE(second_foo, bar);

    EXPECT(first_foo.to_utf16_string().has_short_ascii_storage());
    EXPECT(second_foo.to_utf16_string().has_short_ascii_storage());
    EXPECT(bar.to_utf16_string().has_short_ascii_storage());

    // Short strings do not get stored in the fly string table.
    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 0u);
}
// Long fly strings are registered in the table exactly once per distinct value.
TEST_CASE(long_string)
{
    Utf16FlyString original { "thisisdefinitelymorethan7bytes"_utf16 };
    EXPECT_EQ(original, "thisisdefinitelymorethan7bytes"sv);
    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u);

    // A second fly string with the same contents must not add another table entry.
    Utf16FlyString duplicate { "thisisdefinitelymorethan7bytes"_utf16 };
    EXPECT_EQ(duplicate, "thisisdefinitelymorethan7bytes"sv);
    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u);

    // Different contents get their own entry.
    Utf16FlyString different { "thisisalsoforsuremorethan7bytes"_utf16 };
    EXPECT_EQ(different, "thisisalsoforsuremorethan7bytes"sv);
    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 2u);

    EXPECT_EQ(original, duplicate);
    EXPECT_NE(original, different);
    EXPECT_NE(duplicate, different);

    EXPECT(original.to_utf16_string().has_long_ascii_storage());
    EXPECT(duplicate.to_utf16_string().has_long_ascii_storage());
    EXPECT(different.to_utf16_string().has_long_ascii_storage());
}
// Exercises both literal operators (UTF-8 and UTF-16) and checks that equal long literals
// share a single table entry while short literals stay out of the table.
TEST_CASE(user_defined_literal)
{
    auto fly1 = "thisisdefinitelymorethan7bytes"_utf16_fly_string;
    EXPECT_EQ(fly1, "thisisdefinitelymorethan7bytes"sv);
    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u);

    auto fly2 = "thisisdefinitelymorethan7bytes"_utf16_fly_string;
    EXPECT_EQ(fly2, "thisisdefinitelymorethan7bytes"sv);
    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u);

    // Same contents via the UTF-16 literal operator: still a single table entry.
    auto fly3 = u"thisisdefinitelymorethan7bytes"_utf16_fly_string;
    EXPECT_EQ(fly3, u"thisisdefinitelymorethan7bytes"sv);
    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u);

    // Short literal: not stored in the table, so the count is unchanged.
    auto fly4 = "foo"_utf16_fly_string;
    EXPECT_EQ(fly4, "foo"sv);
    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u);

    // All three long literals must compare equal to one another (every pair is checked).
    EXPECT_EQ(fly1, fly2);
    EXPECT_EQ(fly1, fly3);
    EXPECT_EQ(fly2, fly3);

    EXPECT_NE(fly1, fly4);
    EXPECT_NE(fly2, fly4);
    EXPECT_NE(fly3, fly4);
}
// A fly string must keep its string data (and table entry) alive after the source Utf16String
// is gone, and the entry must disappear once the last fly string is destroyed.
TEST_CASE(fly_string_keep_string_data_alive)
{
    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 0u);
    {
        Utf16FlyString survivor {};
        {
            auto source = "thisisdefinitelymorethan7bytes"_utf16;
            survivor = Utf16FlyString { source };
            EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u);
        }

        // The source string is out of scope; the fly string must still be intact.
        EXPECT_EQ(survivor, "thisisdefinitelymorethan7bytes"sv);
        EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u);
    }

    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 0u);
}
// Move-assigning a fly string leaves the source empty and does not change the table size.
TEST_CASE(moved_fly_string_becomes_empty)
{
    Utf16FlyString destination {};
    EXPECT(destination.is_empty());

    Utf16FlyString source { "thisisdefinitelymorethan7bytes"_utf16 };
    EXPECT_EQ(source, "thisisdefinitelymorethan7bytes"sv);
    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u);

    destination = move(source);

    EXPECT(source.is_empty());
    EXPECT_EQ(destination, "thisisdefinitelymorethan7bytes"sv);
    EXPECT_EQ(Utf16FlyString::number_of_utf16_fly_strings(), 1u);
}
// is_one_of() must accept both fly strings and string views, in any argument order.
TEST_CASE(is_one_of)
{
    auto foo_string = Utf16FlyString::from_utf8("foo"sv);
    auto bar_string = Utf16FlyString::from_utf16(u"bar"sv);

    // Comparing against other fly strings.
    EXPECT(foo_string.is_one_of(foo_string));
    EXPECT(foo_string.is_one_of(foo_string, bar_string));
    EXPECT(foo_string.is_one_of(bar_string, foo_string));
    EXPECT(!foo_string.is_one_of(bar_string));

    // Comparing against string views.
    EXPECT(!bar_string.is_one_of("foo"sv));
    EXPECT(bar_string.is_one_of("foo"sv, "bar"sv));
    EXPECT(bar_string.is_one_of("bar"sv, "foo"sv));
    EXPECT(bar_string.is_one_of("bar"sv));
}