mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-07-30 12:49:19 +00:00
AK: Add a UTF-16 string with optimized short- and ASCII-string storage
This is a strictly UTF-16 string with some optimizations for ASCII.
* If created from a short UTF-8 or UTF-16 string that is also ASCII,
  then the string is stored in an inlined byte buffer.
* If created with a long UTF-8 or UTF-16 string that is also ASCII,
  then the string is stored in an outlined char buffer.
* If created with a short or long UTF-8 or UTF-16 string that is not
  ASCII, then the string is stored in an outlined char16 buffer.
We do not store short non-ASCII text in the inlined buffer to avoid
confusion with operations such as `length_in_code_units` and
`code_unit_at`. For example, "😀" would be stored as 4 UTF-8 bytes
in short string form. But we still want `length_in_code_units` to
be 2, and `code_unit_at(0)` to be 0xD83D.
This commit is contained in:
parent
8fbb80fffc
commit
fe676585f5
Notes:
github-actions[bot]
2025-07-18 16:47:31 +00:00
Author: https://github.com/trflynn89
Commit: fe676585f5
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5388
Reviewed-by: https://github.com/shannonbooth ✅
17 changed files with 1527 additions and 44 deletions
117
AK/Utf16String.h
Normal file
117
AK/Utf16String.h
Normal file
|
@ -0,0 +1,117 @@
|
|||
/*
|
||||
* Copyright (c) 2025, Tim Flynn <trflynn89@ladybird.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/Badge.h>
|
||||
#include <AK/Error.h>
|
||||
#include <AK/Format.h>
|
||||
#include <AK/NonnullRefPtr.h>
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Traits.h>
|
||||
#include <AK/UnicodeUtils.h>
|
||||
#include <AK/Utf16StringBase.h>
|
||||
#include <AK/Utf16StringData.h>
|
||||
#include <AK/Utf16View.h>
|
||||
#include <AK/Utf8View.h>
|
||||
|
||||
namespace AK {
|
||||
|
||||
// Utf16String is a strongly owned sequence of Unicode code points encoded as UTF-16.
|
||||
//
|
||||
// The data may or may not be heap-allocated, and may or may not be reference counted. As a memory optimization, if the
|
||||
// UTF-16 string is entirely ASCII, the string is stored as 8-bit bytes.
|
||||
class [[nodiscard]] Utf16String : public Detail::Utf16StringBase {
|
||||
public:
|
||||
using Utf16StringBase::Utf16StringBase;
|
||||
|
||||
explicit constexpr Utf16String(Utf16StringBase&& base)
|
||||
: Utf16StringBase(move(base))
|
||||
{
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static Utf16String from_utf8(StringView utf8_string)
|
||||
{
|
||||
VERIFY(Utf8View { utf8_string }.validate());
|
||||
return from_utf8_without_validation(utf8_string);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static Utf16String from_utf8(String const& utf8_string)
|
||||
{
|
||||
return from_utf8_without_validation(utf8_string);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static ErrorOr<Utf16String> try_from_utf8(StringView utf8_string)
|
||||
{
|
||||
if (!Utf8View { utf8_string }.validate())
|
||||
return Error::from_string_literal("Input was not valid UTF-8");
|
||||
return from_utf8_without_validation(utf8_string);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static Utf16String from_utf16(Utf16View const& utf16_string)
|
||||
{
|
||||
VERIFY(utf16_string.validate());
|
||||
return from_utf16_without_validation(utf16_string);
|
||||
}
|
||||
|
||||
ALWAYS_INLINE static ErrorOr<Utf16String> try_from_utf16(Utf16View const& utf16_string)
|
||||
{
|
||||
if (!utf16_string.validate())
|
||||
return Error::from_string_literal("Input was not valid UTF-16");
|
||||
return from_utf16_without_validation(utf16_string);
|
||||
}
|
||||
|
||||
static Utf16String from_utf8_without_validation(StringView);
|
||||
static Utf16String from_utf16_without_validation(Utf16View const&);
|
||||
static Utf16String from_utf32(Utf32View const&);
|
||||
|
||||
template<typename T>
|
||||
requires(IsOneOf<RemoveCVReference<T>, Utf16String>)
|
||||
static Utf16String from_utf16(T&&) = delete;
|
||||
|
||||
template<typename T>
|
||||
requires(IsOneOf<RemoveCVReference<T>, Utf16String>)
|
||||
static ErrorOr<Utf16String> try_from_utf16(T&&) = delete;
|
||||
|
||||
template<typename T>
|
||||
requires(IsOneOf<RemoveCVReference<T>, Utf16String>)
|
||||
static Utf16String from_utf16_without_validation(T&&) = delete;
|
||||
|
||||
private:
|
||||
ALWAYS_INLINE explicit Utf16String(NonnullRefPtr<Detail::Utf16StringData const> value)
|
||||
: Utf16StringBase(move(value))
|
||||
{
|
||||
}
|
||||
};
|
||||
|
||||
template<>
|
||||
struct Formatter<Utf16String> : Formatter<FormatString> {
|
||||
ErrorOr<void> format(FormatBuilder&, Utf16String const&);
|
||||
};
|
||||
|
||||
template<>
|
||||
struct Traits<Utf16String> : public DefaultTraits<Utf16String> {
|
||||
static unsigned hash(Utf16String const& s) { return s.hash(); }
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
// User-defined literal: "text"_utf16 builds a Utf16String from a narrow (UTF-8) string literal.
//
// Validity of the literal is only checked through ASSERT; the conversion itself uses the non-validating factory.
[[nodiscard]] ALWAYS_INLINE AK::Utf16String operator""_utf16(char const* string, size_t length)
{
    AK::StringView const view { string, length };
    ASSERT(AK::Utf8View { view }.validate());

    return AK::Utf16String::from_utf8_without_validation(view);
}
|
||||
|
||||
// User-defined literal: u"text"_utf16 builds a Utf16String from a char16_t (UTF-16) string literal.
//
// Validity of the literal is only checked through ASSERT; the conversion itself uses the non-validating factory.
[[nodiscard]] ALWAYS_INLINE AK::Utf16String operator""_utf16(char16_t const* string, size_t length)
{
    AK::Utf16View const view { string, length };
    ASSERT(view.validate());

    return AK::Utf16String::from_utf16_without_validation(view);
}
|
Loading…
Add table
Add a link
Reference in a new issue