ladybird/Libraries/LibJS/Runtime/Utf16String.cpp
Timothy Flynn fe676585f5 AK: Add a UTF-16 string with optimized short- and ASCII-string storage
This is a strictly UTF-16 string with some optimizations for ASCII.

* If created from a short UTF-8 or UTF-16 string that is also ASCII,
  then the string is stored in an inlined byte buffer.

* If created with a long UTF-8 or UTF-16 string that is also ASCII,
  then the string is stored in an outlined char buffer.

* If created with a short or long UTF-8 or UTF-16 string that is not
  ASCII, then the string is stored in an outlined char16 buffer.

We do not store short non-ASCII text in the inlined buffer to avoid
confusion with operations such as `length_in_code_units` and
`code_unit_at`. For example, "😀" would be stored as 4 UTF-8 bytes
in short string form. But we still want `length_in_code_units` to
be 2, and `code_unit_at(0)` to be 0xD83D.
2025-07-18 12:45:38 -04:00

157 lines
3.5 KiB
C++

/*
* Copyright (c) 2021-2023, Tim Flynn <trflynn89@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/StringView.h>
#include <LibJS/Runtime/Utf16String.h>
#include <LibJS/Runtime/VM.h>
namespace JS {
namespace Detail {
// Returns a reference to one shared, lazily created empty string implementation,
// so that callers asking for an empty string all receive the same object.
static NonnullRefPtr<Utf16StringImpl> the_empty_utf16_string()
{
    static auto s_shared_empty = Utf16StringImpl::create();
    return s_shared_empty;
}
// Constructs the implementation by moving the given code unit buffer into m_string.
Utf16StringImpl::Utf16StringImpl(Utf16Data string)
: m_string(move(string))
{
}
// Allocates a default-constructed implementation and adopts it into a NonnullRefPtr.
NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create()
{
    auto* raw_impl = new Utf16StringImpl;
    return adopt_ref(*raw_impl);
}
// Allocates an implementation that takes over the given code unit buffer and
// adopts it into a NonnullRefPtr.
NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16Data string)
{
    auto* raw_impl = new Utf16StringImpl(move(string));
    return adopt_ref(*raw_impl);
}
// Creates an implementation from a StringView by converting it with utf8_to_utf16().
// The conversion is wrapped in MUST(), so a conversion error is not propagated to the caller.
NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(StringView string)
{
    auto converted = MUST(utf8_to_utf16(string));

    // Remember the code point count reported by the conversion before the buffer is moved away.
    auto const code_point_count = converted.code_point_count;

    auto impl = create(move(converted.data));

    // Seed the cached view with the count the conversion already computed.
    impl->m_cached_view.unsafe_set_code_point_length(code_point_count);
    return impl;
}
// Creates an implementation holding a copy of the code units of the given view.
// If the view already knows its length in code points, that value is carried over
// to the new implementation's cached view.
NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
{
    auto const code_unit_count = view.length_in_code_units();

    Utf16Data code_units;
    code_units.ensure_capacity(code_unit_count);

    if (!view.has_ascii_storage()) {
        // The view exposes a UTF-16 span, so the code units can be appended in one call.
        code_units.unchecked_append(view.utf16_span().data(), code_unit_count);
    } else {
        // ASCII-backed view: fetch each code unit individually and store it as char16_t.
        for (size_t index = 0; index < code_unit_count; ++index)
            code_units.unchecked_append(static_cast<char16_t>(view.code_unit_at(index)));
    }

    auto impl = create(move(code_units));

    auto known_code_points = view.length_in_code_points_if_known();
    if (known_code_points.has_value())
        impl->m_cached_view.unsafe_set_code_point_length(*known_code_points);

    return impl;
}
// Gives read-only access to the code unit buffer stored in this implementation.
Utf16Data const& Utf16StringImpl::string() const
{
    auto const& code_units = m_string;
    return code_units;
}
// Returns a copy of the view cached on this implementation.
Utf16View Utf16StringImpl::view() const
{
    auto cached = m_cached_view;
    return cached;
}
// Computes the hash of this string.
//
// An empty string hashes to 0. Otherwise the code unit buffer is reinterpreted as
// raw bytes and passed to string_hash(), with the byte length computed as the
// number of code units times sizeof(u16).
u32 Utf16StringImpl::compute_hash() const
{
    if (m_string.is_empty())
        return 0;

    // Use a named cast rather than a C-style cast so the pointer reinterpretation is explicit.
    auto const* bytes = reinterpret_cast<char const*>(m_string.data());
    auto const byte_count = m_string.size() * sizeof(u16);

    return string_hash(bytes, byte_count);
}
}
// Creates an empty string backed by the shared empty implementation.
Utf16String Utf16String::create()
{
    Utf16String empty { Detail::the_empty_utf16_string() };
    return empty;
}
// Creates a string whose implementation takes over the given code unit buffer.
Utf16String Utf16String::create(Utf16Data string)
{
    Utf16String result { Detail::Utf16StringImpl::create(move(string)) };
    return result;
}
// Creates a string from a StringView; the conversion is done by Detail::Utf16StringImpl::create().
Utf16String Utf16String::create(StringView string)
{
    Utf16String result { Detail::Utf16StringImpl::create(string) };
    return result;
}
// Creates a string from a Utf16View; the copy is made by Detail::Utf16StringImpl::create().
Utf16String Utf16String::create(Utf16View const& string)
{
    Utf16String result { Detail::Utf16StringImpl::create(string) };
    return result;
}
// Returns a copy of a single, lazily created, default-constructed Utf16String.
Utf16String Utf16String::invalid()
{
    static Utf16String s_invalid {};
    return s_invalid;
}
// Wraps an existing implementation object; the given reference is moved into m_string.
Utf16String::Utf16String(NonnullRefPtr<Detail::Utf16StringImpl> string)
: m_string(move(string))
{
}
// Gives read-only access to the code unit buffer held by the underlying implementation.
Utf16Data const& Utf16String::string() const
{
    auto const& code_units = m_string->string();
    return code_units;
}
// Returns the view provided by the underlying implementation.
Utf16View Utf16String::view() const
{
    auto impl_view = m_string->view();
    return impl_view;
}
// Returns the sub-view of this string selected by the given code unit offset and length.
// Both arguments are forwarded unchanged to Utf16View::substring_view().
Utf16View Utf16String::substring_view(size_t code_unit_offset, size_t code_unit_length) const
{
    auto whole = view();
    return whole.substring_view(code_unit_offset, code_unit_length);
}
// Returns the sub-view of this string selected by the given code unit offset.
// The argument is forwarded unchanged to the single-argument Utf16View::substring_view().
Utf16View Utf16String::substring_view(size_t code_unit_offset) const
{
    auto whole = view();
    return whole.substring_view(code_unit_offset);
}
// Converts this string to a String via Utf16View::to_utf8().
// The conversion is wrapped in MUST(), so an error is not propagated to the caller.
String Utf16String::to_utf8() const
{
    auto utf8 = MUST(view().to_utf8());
    return utf8;
}
// Converts this string to a ByteString via Utf16View::to_byte_string().
// The conversion is wrapped in MUST(), so an error is not propagated to the caller.
ByteString Utf16String::to_byte_string() const
{
    auto bytes = MUST(view().to_byte_string());
    return bytes;
}
// Returns the code unit at the given index, as reported by this string's view.
u16 Utf16String::code_unit_at(size_t index) const
{
    auto code_units = view();
    return code_units.code_unit_at(index);
}
// Returns the length of this string in code units, as reported by its view.
size_t Utf16String::length_in_code_units() const
{
    auto code_units = view();
    return code_units.length_in_code_units();
}
// Reports whether this string's view is empty.
bool Utf16String::is_empty() const
{
    auto code_units = view();
    return code_units.is_empty();
}
}