mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-08-09 09:39:39 +00:00
AK+Everywhere: Prepare Utf16View for integration with a UTF-16 string
To prepare for an upcoming Utf16String, this migrates Utf16View to store its data as a char16_t. Most function definitions are moved inline and made constexpr. This also adds a UDL to construct a Utf16View from a string literal: auto string = u"hello"sv; This lets us remove the NTTP Utf16View constructor, as we have found that such constructors bloat binary size quite a bit.
This commit is contained in:
parent
c17b067e1d
commit
86b1c78c1a
Notes:
github-actions[bot]
2025-07-03 13:53:23 +00:00
Author: https://github.com/trflynn89
Commit: 86b1c78c1a
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5228
Reviewed-by: https://github.com/ADKaster ✅
Reviewed-by: https://github.com/shannonbooth
17 changed files with 406 additions and 421 deletions
293
AK/Utf16View.cpp
293
AK/Utf16View.cpp
|
@ -80,77 +80,75 @@ ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const& utf32_view)
|
|||
return Utf16ConversionResult { utf16_data, length };
|
||||
}
|
||||
|
||||
// Returns the result of simdutf::validate_utf16le for `bytes`, reinterpreted as char16_t code units.
// The code unit count passed to simdutf is bytes.size() / 2.
// NOTE(review): assuming size() is an integer, a trailing odd byte is never handed to the validator — confirm
// that callers do not rely on odd-length input being rejected.
bool validate_utf16_le(ReadonlyBytes bytes)
|
||||
{
|
||||
return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
|
||||
}
|
||||
|
||||
// Returns the result of simdutf::validate_utf16be for `bytes`, reinterpreted as char16_t code units.
// The code unit count passed to simdutf is bytes.size() / 2.
// NOTE(review): assuming size() is an integer, a trailing odd byte is never handed to the validator — confirm
// that callers do not rely on odd-length input being rejected.
bool validate_utf16_be(ReadonlyBytes bytes)
|
||||
{
|
||||
return simdutf::validate_utf16be(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
|
||||
}
|
||||
|
||||
// Forwards the characters and length of `string` to simdutf::utf16_length_from_utf8 and returns its result.
// NOTE(review): presumably this is the number of UTF-16 code units needed to encode the UTF-8 input; the result
// for malformed UTF-8 is whatever simdutf defines — confirm against the simdutf documentation.
size_t utf16_code_unit_length_from_utf8(StringView string)
|
||||
{
|
||||
return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length());
|
||||
}
|
||||
|
||||
// Converts this view to a UTF-8 String.
// - An empty view yields an empty String.
// - If validate(allow_invalid_code_units) fails, an error is returned.
// - With AllowInvalidCodeUnits::No, the conversion is done by simdutf directly into the String's buffer.
// - Otherwise the view is appended to a StringBuilder and the builder's string is returned.
ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
|
||||
{
|
||||
if (is_empty())
|
||||
return String {};
|
||||
if (!validate(allow_invalid_code_units))
|
||||
return Error::from_string_literal("Input was not valid UTF-16");
|
||||
|
||||
if (allow_invalid_code_units == AllowInvalidCodeUnits::No) {
|
||||
String result;
|
||||
// Ask simdutf how many UTF-8 bytes the conversion will produce so the destination can be sized up front.
auto utf8_length = simdutf::utf8_length_from_utf16(m_string, length_in_code_units());
|
||||
|
||||
TRY(result.replace_with_new_string(Badge<Utf16View> {}, utf8_length, [&](Bytes buffer) -> ErrorOr<void> {
|
||||
// Note: this `result` is the converter's return value and shadows the outer String of the same name.
[[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(m_string, length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
|
||||
// The converter's return value is expected to equal the size of the buffer it was given.
ASSERT(result == buffer.size());
|
||||
return {};
|
||||
}));
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
// Invalid code units are allowed here: defer to StringBuilder's handling of a Utf16View.
StringBuilder builder;
|
||||
builder.append(*this);
|
||||
return builder.to_string();
|
||||
}
|
||||
|
||||
// Converts this view to a ByteString by first calling to_utf8() with the same `allow_invalid_code_units`
// setting; any error from to_utf8() is propagated via TRY.
ErrorOr<ByteString> Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invalid_code_units) const
|
||||
{
|
||||
return TRY(to_utf8(allow_invalid_code_units)).to_byte_string();
|
||||
}
|
||||
|
||||
// Converts this view to a UTF-8 String.
// With AllowInvalidCodeUnits::No the work is delegated to String::from_utf16(); otherwise the view is appended
// to a StringBuilder and the builder's string is returned.
ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
|
||||
{
|
||||
if (allow_invalid_code_units == AllowInvalidCodeUnits::No)
|
||||
return String::from_utf16(*this);
|
||||
|
||||
StringBuilder builder;
|
||||
builder.append(*this);
|
||||
return builder.to_string();
|
||||
}
|
||||
|
||||
// Runs simdutf::validate_ascii over the raw bytes of the code units
// (length_in_code_units() * sizeof(char16_t) bytes) and returns its result.
// NOTE(review): the check is byte-wise, not code-unit-wise. A code unit such as 0x0100 consists of two bytes that
// are each below 0x80 — confirm that such input is rejected as intended.
bool Utf16View::is_ascii() const
|
||||
{
|
||||
return simdutf::validate_ascii(reinterpret_cast<char const*>(m_code_units.data()), length_in_code_units() * sizeof(char16_t));
|
||||
// NOTE(review): as written this second return is unreachable; it looks like the replacement for the line above
// (m_string instead of m_code_units.data()) — confirm which of the two is current.
return simdutf::validate_ascii(reinterpret_cast<char const*>(m_string), length_in_code_units() * sizeof(char16_t));
|
||||
}
|
||||
|
||||
size_t Utf16View::length_in_code_points() const
|
||||
bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const
|
||||
{
|
||||
if (m_length_in_code_points == NumericLimits<size_t>::max())
|
||||
m_length_in_code_points = calculate_length_in_code_points();
|
||||
return m_length_in_code_points;
|
||||
}
|
||||
auto view = *this;
|
||||
valid_code_units = 0;
|
||||
|
||||
u16 Utf16View::code_unit_at(size_t index) const
|
||||
{
|
||||
VERIFY(index < length_in_code_units());
|
||||
return m_code_units[index];
|
||||
}
|
||||
while (!view.is_empty()) {
|
||||
auto result = simdutf::validate_utf16_with_errors(view.m_string, view.length_in_code_units());
|
||||
valid_code_units += result.count;
|
||||
|
||||
u32 Utf16View::code_point_at(size_t index) const
|
||||
{
|
||||
VERIFY(index < length_in_code_units());
|
||||
if (result.error == simdutf::SUCCESS)
|
||||
return true;
|
||||
if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE)
|
||||
return false;
|
||||
|
||||
u32 code_point = code_unit_at(index);
|
||||
if (!UnicodeUtils::is_utf16_high_surrogate(code_point) && !UnicodeUtils::is_utf16_low_surrogate(code_point))
|
||||
return code_point;
|
||||
if (UnicodeUtils::is_utf16_low_surrogate(code_point) || (index + 1 == length_in_code_units()))
|
||||
return code_point;
|
||||
|
||||
auto second = code_unit_at(index + 1);
|
||||
if (!UnicodeUtils::is_utf16_low_surrogate(second))
|
||||
return code_point;
|
||||
|
||||
return UnicodeUtils::decode_utf16_surrogate_pair(code_point, second);
|
||||
}
|
||||
|
||||
size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
|
||||
{
|
||||
if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
|
||||
return code_unit_offset;
|
||||
|
||||
size_t code_point_offset = 0;
|
||||
|
||||
for (auto it = begin(); it != end(); ++it) {
|
||||
if (code_unit_offset == 0)
|
||||
return code_point_offset;
|
||||
|
||||
code_unit_offset -= it.length_in_code_units();
|
||||
++code_point_offset;
|
||||
view = view.substring_view(result.count + 1);
|
||||
++valid_code_units;
|
||||
}
|
||||
|
||||
return code_point_offset;
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
|
||||
|
@ -171,19 +169,22 @@ size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
|
|||
return code_unit_offset;
|
||||
}
|
||||
|
||||
size_t Utf16View::code_unit_offset_of(Utf16CodePointIterator const& it) const
|
||||
size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
|
||||
{
|
||||
VERIFY(it.m_ptr >= begin_ptr());
|
||||
VERIFY(it.m_ptr <= end_ptr());
|
||||
if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
|
||||
return code_unit_offset;
|
||||
|
||||
return it.m_ptr - begin_ptr();
|
||||
}
|
||||
size_t code_point_offset = 0;
|
||||
|
||||
Utf16View Utf16View::substring_view(size_t code_unit_offset, size_t code_unit_length) const
|
||||
{
|
||||
VERIFY(!Checked<size_t>::addition_would_overflow(code_unit_offset, code_unit_length));
|
||||
for (auto it = begin(); it != end(); ++it) {
|
||||
if (code_unit_offset == 0)
|
||||
return code_point_offset;
|
||||
|
||||
return Utf16View { m_code_units.slice(code_unit_offset, code_unit_length) };
|
||||
code_unit_offset -= it.length_in_code_units();
|
||||
++code_point_offset;
|
||||
}
|
||||
|
||||
return code_point_offset;
|
||||
}
|
||||
|
||||
Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const
|
||||
|
@ -194,7 +195,10 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
|
|||
if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
|
||||
return substring_view(code_point_offset, code_point_length);
|
||||
|
||||
auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { return it.m_ptr - begin_ptr(); };
|
||||
auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) {
|
||||
return it.m_iterator - m_string;
|
||||
};
|
||||
|
||||
size_t code_point_index = 0;
|
||||
size_t code_unit_offset = 0;
|
||||
|
||||
|
@ -213,101 +217,13 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
|
|||
VERIFY_NOT_REACHED();
|
||||
}
|
||||
|
||||
// Searches this view's code units for `needle`'s code units via m_code_units.index_of(), passing `start_offset`
// through, and returns whatever index_of() reports (an empty Optional presumably means "not found").
Optional<size_t> Utf16View::find_code_unit_offset(Utf16View const& needle, size_t start_offset) const
|
||||
{
|
||||
return m_code_units.index_of(needle.m_code_units, start_offset);
|
||||
}
|
||||
|
||||
// Finds the first code unit offset at or after `start_offset` where a needle-sized slice of this view compares
// equal to `needle` via equals_ignoring_case(). Returns an empty Optional when there is no such offset or when
// start_offset + needle length overflows or exceeds this view's length. An empty needle matches at `start_offset`.
Optional<size_t> Utf16View::find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset) const
|
||||
{
|
||||
// Overflow-checked computation of the smallest end offset a match could have.
Checked maximum_offset { start_offset };
|
||||
maximum_offset += needle.length_in_code_units();
|
||||
if (maximum_offset.has_overflow() || maximum_offset.value() > length_in_code_units())
|
||||
return {};
|
||||
|
||||
if (needle.is_empty())
|
||||
return start_offset;
|
||||
|
||||
size_t index = start_offset;
|
||||
// The subtraction cannot underflow: the guard above ensured the needle is no longer than this view.
while (index <= length_in_code_units() - needle.length_in_code_units()) {
|
||||
Utf16View const slice { m_code_units.slice(index, needle.length_in_code_units()) };
|
||||
if (slice.equals_ignoring_case(needle))
|
||||
return index;
|
||||
// Advance by the code unit length reported by the slice's first iterator position, rather than by 1.
index += slice.begin().length_in_code_units();
|
||||
}
|
||||
|
||||
return {};
|
||||
}
|
||||
|
||||
// Reports whether this view begins with `needle`, comparing the values yielded by the two views' iterators.
// An empty needle always matches; an empty view or a needle longer than this view never does.
bool Utf16View::starts_with(Utf16View const& needle) const
|
||||
{
|
||||
if (needle.is_empty())
|
||||
return true;
|
||||
if (is_empty())
|
||||
return false;
|
||||
if (needle.length_in_code_units() > length_in_code_units())
|
||||
return false;
|
||||
// Same starting pointer and needle is not longer than this view, so the prefix is trivially identical.
if (begin_ptr() == needle.begin_ptr())
|
||||
return true;
|
||||
|
||||
for (auto this_it = begin(), needle_it = needle.begin(); needle_it != needle.end(); ++needle_it, ++this_it) {
|
||||
if (*this_it != *needle_it)
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// https://infra.spec.whatwg.org/#code-unit-less-than
// Compares the two views code unit by code unit: the first differing position decides the result; if one view
// is a prefix of the other (or they are equal), the result is whether this view is strictly shorter.
|
||||
bool Utf16View::is_code_unit_less_than(Utf16View const& other) const
|
||||
{
|
||||
auto a = m_code_units;
|
||||
auto b = other.m_code_units;
|
||||
|
||||
auto common_length = min(a.size(), b.size());
|
||||
|
||||
for (size_t position = 0; position < common_length; ++position) {
|
||||
if (a[position] != b[position])
|
||||
return a[position] < b[position];
|
||||
}
|
||||
|
||||
// No difference within the common prefix: the shorter view sorts first.
return a.size() < b.size();
|
||||
}
|
||||
|
||||
// Convenience overload: calls the two-argument validate() with a local counter and returns only its boolean
// result, discarding the number of valid code units.
bool Utf16View::validate(AllowInvalidCodeUnits allow_invalid_code_units) const
|
||||
{
|
||||
size_t valid_code_units = 0;
|
||||
return validate(valid_code_units, allow_invalid_code_units);
|
||||
}
|
||||
|
||||
// Validates this view with simdutf::validate_utf16_with_errors and writes a running count to `valid_code_units`
// (reset to 0 on entry). Returns true if the whole view validates. When invalid code units are allowed, a
// SURROGATE error is tolerated: the offending code unit is skipped, counted, and validation resumes after it.
// Any other error, or any error when invalid code units are not allowed, returns false.
bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const
|
||||
{
|
||||
auto view = *this;
|
||||
valid_code_units = 0;
|
||||
|
||||
while (!view.is_empty()) {
|
||||
auto result = simdutf::validate_utf16_with_errors(view.char_data(), view.length_in_code_units());
|
||||
// result.count is added on both success and failure.
valid_code_units += result.count;
|
||||
|
||||
if (result.error == simdutf::SUCCESS)
|
||||
return true;
|
||||
if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE)
|
||||
return false;
|
||||
|
||||
// Step past the code unit at the error position and count it as accepted.
view = view.substring_view(result.count + 1);
|
||||
++valid_code_units;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
size_t Utf16View::calculate_length_in_code_points() const
|
||||
{
|
||||
// FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement
|
||||
// for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can
|
||||
// remove this branch.
|
||||
if (validate()) [[likely]]
|
||||
return simdutf::count_utf16(char_data(), length_in_code_units());
|
||||
return simdutf::count_utf16(m_string, length_in_code_units());
|
||||
|
||||
size_t code_points = 0;
|
||||
for ([[maybe_unused]] auto code_point : *this)
|
||||
|
@ -315,81 +231,4 @@ size_t Utf16View::calculate_length_in_code_points() const
|
|||
return code_points;
|
||||
}
|
||||
|
||||
// Compares two views code unit by code unit after passing each code unit through to_ascii_lowercase().
// Views of different code unit lengths are never equal.
bool Utf16View::equals_ignoring_case(Utf16View const& other) const
|
||||
{
|
||||
if (length_in_code_units() != other.length_in_code_units())
|
||||
return false;
|
||||
|
||||
for (size_t i = 0; i < length_in_code_units(); ++i) {
|
||||
// FIXME: Handle non-ASCII case insensitive comparisons.
|
||||
if (to_ascii_lowercase(m_code_units[i]) != to_ascii_lowercase(other.m_code_units[i]))
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
// Advances the iterator by length_in_code_units() code units, clamped so that it never moves past the
// remaining code units, and returns *this.
Utf16CodePointIterator& Utf16CodePointIterator::operator++()
|
||||
{
|
||||
size_t code_units = length_in_code_units();
|
||||
|
||||
if (code_units > m_remaining_code_units) {
|
||||
// If there aren't enough code units remaining, skip to the end.
|
||||
m_ptr += m_remaining_code_units;
|
||||
m_remaining_code_units = 0;
|
||||
} else {
|
||||
m_ptr += code_units;
|
||||
m_remaining_code_units -= code_units;
|
||||
}
|
||||
|
||||
return *this;
|
||||
}
|
||||
|
||||
// Decodes the code point at the iterator's current position.
// Returns the decoded surrogate pair when a high surrogate is followed by a low surrogate, the code unit itself
// when it is not a surrogate, and UnicodeUtils::REPLACEMENT_CODE_POINT for an unpaired high or low surrogate.
// Dereferencing with no remaining code units trips the VERIFY below.
u32 Utf16CodePointIterator::operator*() const
|
||||
{
|
||||
VERIFY(m_remaining_code_units > 0);
|
||||
|
||||
// rfc2781, 2.2 Decoding UTF-16
|
||||
// 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
|
||||
// of W1. Terminate.
|
||||
// 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
|
||||
// is in error and no valid character can be obtained using W1.
|
||||
// Terminate.
|
||||
// 3) If there is no W2 (that is, the sequence ends with W1), or if W2
|
||||
// is not between 0xDC00 and 0xDFFF, the sequence is in error.
|
||||
// Terminate.
|
||||
// 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
|
||||
// bits of W1 as its 10 high-order bits and the 10 low-order bits of
|
||||
// W2 as its 10 low-order bits.
|
||||
// 5) Add 0x10000 to U' to obtain the character value U. Terminate.
|
||||
|
||||
auto code_unit = *m_ptr;
|
||||
|
||||
if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) {
|
||||
// Only peek at the next code unit if one actually remains.
if (m_remaining_code_units > 1) {
|
||||
auto next_code_unit = *(m_ptr + 1);
|
||||
|
||||
if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit))
|
||||
return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
|
||||
}
|
||||
|
||||
// High surrogate at the end of input or not followed by a low surrogate.
return UnicodeUtils::REPLACEMENT_CODE_POINT;
|
||||
}
|
||||
|
||||
// A low surrogate that was not consumed as the second half of a pair.
if (UnicodeUtils::is_utf16_low_surrogate(code_unit))
|
||||
return UnicodeUtils::REPLACEMENT_CODE_POINT;
|
||||
|
||||
return static_cast<u32>(code_unit);
|
||||
}
|
||||
|
||||
// Returns the result of simdutf::validate_utf16le for `bytes`, reinterpreted as char16_t code units.
// The code unit count passed to simdutf is bytes.size() / 2.
// NOTE(review): assuming size() is an integer, a trailing odd byte is never handed to the validator — confirm
// that callers do not rely on odd-length input being rejected.
bool validate_utf16_le(ReadonlyBytes bytes)
|
||||
{
|
||||
return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
|
||||
}
|
||||
|
||||
// Returns the result of simdutf::validate_utf16be for `bytes`, reinterpreted as char16_t code units.
// The code unit count passed to simdutf is bytes.size() / 2.
// NOTE(review): assuming size() is an integer, a trailing odd byte is never handed to the validator — confirm
// that callers do not rely on odd-length input being rejected.
bool validate_utf16_be(ReadonlyBytes bytes)
|
||||
{
|
||||
return simdutf::validate_utf16be(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
|
||||
}
|
||||
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue