AK+Everywhere: Prepare Utf16View for integration with a UTF-16 string

To prepare for an upcoming Utf16String, this migrates Utf16View to store
its data as a char16_t. Most function definitions are moved inline and
made constexpr.

This also adds a UDL to construct a Utf16View from a string literal:

    auto string = u"hello"sv;

This lets us remove the NTTP Utf16View constructor, as we have found
that such constructors bloat binary size quite a bit.
This commit is contained in:
Timothy Flynn 2025-06-26 12:52:23 -04:00 committed by Tim Flynn
commit 86b1c78c1a
Notes: github-actions[bot] 2025-07-03 13:53:23 +00:00
17 changed files with 406 additions and 421 deletions

View file

@ -93,22 +93,7 @@ ErrorOr<String> String::from_utf16_be(ReadonlyBytes bytes)
ErrorOr<String> String::from_utf16(Utf16View const& utf16) ErrorOr<String> String::from_utf16(Utf16View const& utf16)
{ {
if (!utf16.validate()) return utf16.to_utf8();
return Error::from_string_literal("String::from_utf16: Input was not valid UTF-16");
if (utf16.is_empty())
return String {};
String result;
auto utf8_length = simdutf::utf8_length_from_utf16(utf16.char_data(), utf16.length_in_code_units());
TRY(result.replace_with_new_string(utf8_length, [&](Bytes buffer) -> ErrorOr<void> {
[[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(utf16.char_data(), utf16.length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
ASSERT(result == buffer.size());
return {};
}));
return result;
} }
ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count) ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count)

View file

@ -86,6 +86,12 @@ public:
return replace_with_new_string(byte_count, forward<Func>(callback)); return replace_with_new_string(byte_count, forward<Func>(callback));
} }
template<typename Func>
ALWAYS_INLINE ErrorOr<void> replace_with_new_string(Badge<Utf16View>, size_t byte_count, Func&& callback)
{
return replace_with_new_string(byte_count, forward<Func>(callback));
}
protected: protected:
template<typename Func> template<typename Func>
ErrorOr<void> replace_with_new_string(size_t byte_count, Func&& callback) ErrorOr<void> replace_with_new_string(size_t byte_count, Func&& callback)

View file

@ -250,17 +250,17 @@ ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
if (utf16_view.is_empty()) if (utf16_view.is_empty())
return {}; return {};
auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(utf16_view.span()); auto remaining_view = utf16_view.span();
auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(remaining_view);
// Possibly over-allocate a little to ensure we don't have to allocate later. // Possibly over-allocate a little to ensure we don't have to allocate later.
TRY(will_append(maximum_utf8_length)); TRY(will_append(maximum_utf8_length));
Utf16View remaining_view = utf16_view;
for (;;) { for (;;) {
auto uninitialized_data_pointer = static_cast<char*>(m_buffer.end_pointer()); auto* uninitialized_data_pointer = static_cast<char*>(m_buffer.end_pointer());
// Fast path. // Fast path.
auto result = simdutf::convert_utf16_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer); auto result = simdutf::convert_utf16_to_utf8_with_errors(remaining_view.data(), remaining_view.size(), uninitialized_data_pointer);
if (result.error == simdutf::SUCCESS) { if (result.error == simdutf::SUCCESS) {
auto bytes_just_written = result.count; auto bytes_just_written = result.count;
m_buffer.set_size(m_buffer.size() + bytes_just_written); m_buffer.set_size(m_buffer.size() + bytes_just_written);
@ -269,13 +269,13 @@ ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
// Slow path. Found unmatched surrogate code unit. // Slow path. Found unmatched surrogate code unit.
auto first_invalid_code_unit = result.count; auto first_invalid_code_unit = result.count;
ASSERT(first_invalid_code_unit < remaining_view.length_in_code_units()); ASSERT(first_invalid_code_unit < remaining_view.size());
// Unfortunately, `simdutf` does not tell us how many bytes it just wrote in case of an error, so we have to calculate it ourselves. // Unfortunately, `simdutf` does not tell us how many bytes it just wrote in case of an error, so we have to calculate it ourselves.
auto bytes_just_written = simdutf::utf8_length_from_utf16(remaining_view.char_data(), first_invalid_code_unit); auto bytes_just_written = simdutf::utf8_length_from_utf16(remaining_view.data(), first_invalid_code_unit);
do { do {
auto code_unit = remaining_view.code_unit_at(first_invalid_code_unit++); auto code_unit = remaining_view[first_invalid_code_unit++];
// Invalid surrogate code units are U+D800 - U+DFFF, so they are always encoded using 3 bytes. // Invalid surrogate code units are U+D800 - U+DFFF, so they are always encoded using 3 bytes.
ASSERT(code_unit >= 0xD800 && code_unit <= 0xDFFF); ASSERT(code_unit >= 0xD800 && code_unit <= 0xDFFF);
@ -283,11 +283,11 @@ ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 12) & 0x0f) | 0xe0); uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 12) & 0x0f) | 0xe0);
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 6) & 0x3f) | 0x80); uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 6) & 0x3f) | 0x80);
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 0) & 0x3f) | 0x80); uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 0) & 0x3f) | 0x80);
} while (first_invalid_code_unit < remaining_view.length_in_code_units() && UnicodeUtils::is_utf16_low_surrogate(remaining_view.data()[first_invalid_code_unit])); } while (first_invalid_code_unit < remaining_view.size() && UnicodeUtils::is_utf16_low_surrogate(remaining_view.data()[first_invalid_code_unit]));
// Code unit might no longer be invalid, retry on the remaining data. // Code unit might no longer be invalid, retry on the remaining data.
m_buffer.set_size(m_buffer.size() + bytes_just_written); m_buffer.set_size(m_buffer.size() + bytes_just_written);
remaining_view = remaining_view.substring_view(first_invalid_code_unit); remaining_view = remaining_view.slice(first_invalid_code_unit);
} }
return {}; return {};

View file

@ -212,7 +212,7 @@ String StringView::to_ascii_lowercase_string() const
String result; String result;
MUST(result.replace_with_new_string({}, length(), [&](Bytes buffer) -> ErrorOr<void> { MUST(result.replace_with_new_string(Badge<StringView> {}, length(), [&](Bytes buffer) -> ErrorOr<void> {
for (auto [i, character] : enumerate(bytes())) for (auto [i, character] : enumerate(bytes()))
buffer[i] = static_cast<u8>(AK::to_ascii_lowercase(character)); buffer[i] = static_cast<u8>(AK::to_ascii_lowercase(character));
return {}; return {};
@ -227,7 +227,7 @@ String StringView::to_ascii_uppercase_string() const
String result; String result;
MUST(result.replace_with_new_string({}, length(), [&](Bytes buffer) -> ErrorOr<void> { MUST(result.replace_with_new_string(Badge<StringView> {}, length(), [&](Bytes buffer) -> ErrorOr<void> {
for (auto [i, character] : enumerate(bytes())) for (auto [i, character] : enumerate(bytes()))
buffer[i] = static_cast<u8>(AK::to_ascii_uppercase(character)); buffer[i] = static_cast<u8>(AK::to_ascii_uppercase(character));
return {}; return {};

View file

@ -158,7 +158,7 @@ constexpr ErrorOr<size_t> try_code_point_to_utf16(u32 code_point, Callback callb
* Compute the maximum number of UTF-8 bytes needed to store a given UTF-16 string, accounting for unmatched UTF-16 surrogates. * Compute the maximum number of UTF-8 bytes needed to store a given UTF-16 string, accounting for unmatched UTF-16 surrogates.
* This function will overcount by at most 33%; 2 bytes for every valid UTF-16 codepoint between U+10000 and U+10FFFF. * This function will overcount by at most 33%; 2 bytes for every valid UTF-16 codepoint between U+10000 and U+10FFFF.
*/ */
[[nodiscard]] static inline size_t maximum_utf8_length_from_utf16(ReadonlySpan<u16> code_units) [[nodiscard]] static inline size_t maximum_utf8_length_from_utf16(ReadonlySpan<char16_t> code_units)
{ {
// # UTF-8 code point -> no. UTF-8 bytes needed // # UTF-8 code point -> no. UTF-8 bytes needed
// U+0000 - U+007F => 1 UTF-8 bytes // U+0000 - U+007F => 1 UTF-8 bytes

View file

@ -80,77 +80,75 @@ ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const& utf32_view)
return Utf16ConversionResult { utf16_data, length }; return Utf16ConversionResult { utf16_data, length };
} }
bool validate_utf16_le(ReadonlyBytes bytes)
{
return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
}
bool validate_utf16_be(ReadonlyBytes bytes)
{
return simdutf::validate_utf16be(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
}
size_t utf16_code_unit_length_from_utf8(StringView string) size_t utf16_code_unit_length_from_utf8(StringView string)
{ {
return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length()); return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length());
} }
ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
{
if (is_empty())
return String {};
if (!validate(allow_invalid_code_units))
return Error::from_string_literal("Input was not valid UTF-16");
if (allow_invalid_code_units == AllowInvalidCodeUnits::No) {
String result;
auto utf8_length = simdutf::utf8_length_from_utf16(m_string, length_in_code_units());
TRY(result.replace_with_new_string(Badge<Utf16View> {}, utf8_length, [&](Bytes buffer) -> ErrorOr<void> {
[[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(m_string, length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
ASSERT(result == buffer.size());
return {};
}));
return result;
}
StringBuilder builder;
builder.append(*this);
return builder.to_string();
}
ErrorOr<ByteString> Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invalid_code_units) const ErrorOr<ByteString> Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invalid_code_units) const
{ {
return TRY(to_utf8(allow_invalid_code_units)).to_byte_string(); return TRY(to_utf8(allow_invalid_code_units)).to_byte_string();
} }
ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
{
if (allow_invalid_code_units == AllowInvalidCodeUnits::No)
return String::from_utf16(*this);
StringBuilder builder;
builder.append(*this);
return builder.to_string();
}
bool Utf16View::is_ascii() const bool Utf16View::is_ascii() const
{ {
return simdutf::validate_ascii(reinterpret_cast<char const*>(m_code_units.data()), length_in_code_units() * sizeof(char16_t)); return simdutf::validate_ascii(reinterpret_cast<char const*>(m_string), length_in_code_units() * sizeof(char16_t));
} }
size_t Utf16View::length_in_code_points() const bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const
{ {
if (m_length_in_code_points == NumericLimits<size_t>::max()) auto view = *this;
m_length_in_code_points = calculate_length_in_code_points(); valid_code_units = 0;
return m_length_in_code_points;
}
u16 Utf16View::code_unit_at(size_t index) const while (!view.is_empty()) {
{ auto result = simdutf::validate_utf16_with_errors(view.m_string, view.length_in_code_units());
VERIFY(index < length_in_code_units()); valid_code_units += result.count;
return m_code_units[index];
}
u32 Utf16View::code_point_at(size_t index) const if (result.error == simdutf::SUCCESS)
{ return true;
VERIFY(index < length_in_code_units()); if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE)
return false;
u32 code_point = code_unit_at(index); view = view.substring_view(result.count + 1);
if (!UnicodeUtils::is_utf16_high_surrogate(code_point) && !UnicodeUtils::is_utf16_low_surrogate(code_point)) ++valid_code_units;
return code_point;
if (UnicodeUtils::is_utf16_low_surrogate(code_point) || (index + 1 == length_in_code_units()))
return code_point;
auto second = code_unit_at(index + 1);
if (!UnicodeUtils::is_utf16_low_surrogate(second))
return code_point;
return UnicodeUtils::decode_utf16_surrogate_pair(code_point, second);
}
size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
{
if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
return code_unit_offset;
size_t code_point_offset = 0;
for (auto it = begin(); it != end(); ++it) {
if (code_unit_offset == 0)
return code_point_offset;
code_unit_offset -= it.length_in_code_units();
++code_point_offset;
} }
return code_point_offset; return true;
} }
size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
@ -171,19 +169,22 @@ size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
return code_unit_offset; return code_unit_offset;
} }
size_t Utf16View::code_unit_offset_of(Utf16CodePointIterator const& it) const size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
{ {
VERIFY(it.m_ptr >= begin_ptr()); if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
VERIFY(it.m_ptr <= end_ptr()); return code_unit_offset;
return it.m_ptr - begin_ptr(); size_t code_point_offset = 0;
}
Utf16View Utf16View::substring_view(size_t code_unit_offset, size_t code_unit_length) const for (auto it = begin(); it != end(); ++it) {
{ if (code_unit_offset == 0)
VERIFY(!Checked<size_t>::addition_would_overflow(code_unit_offset, code_unit_length)); return code_point_offset;
return Utf16View { m_code_units.slice(code_unit_offset, code_unit_length) }; code_unit_offset -= it.length_in_code_units();
++code_point_offset;
}
return code_point_offset;
} }
Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const
@ -194,7 +195,10 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit. if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
return substring_view(code_point_offset, code_point_length); return substring_view(code_point_offset, code_point_length);
auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { return it.m_ptr - begin_ptr(); }; auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) {
return it.m_iterator - m_string;
};
size_t code_point_index = 0; size_t code_point_index = 0;
size_t code_unit_offset = 0; size_t code_unit_offset = 0;
@ -213,101 +217,13 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
VERIFY_NOT_REACHED(); VERIFY_NOT_REACHED();
} }
Optional<size_t> Utf16View::find_code_unit_offset(Utf16View const& needle, size_t start_offset) const
{
return m_code_units.index_of(needle.m_code_units, start_offset);
}
Optional<size_t> Utf16View::find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset) const
{
Checked maximum_offset { start_offset };
maximum_offset += needle.length_in_code_units();
if (maximum_offset.has_overflow() || maximum_offset.value() > length_in_code_units())
return {};
if (needle.is_empty())
return start_offset;
size_t index = start_offset;
while (index <= length_in_code_units() - needle.length_in_code_units()) {
Utf16View const slice { m_code_units.slice(index, needle.length_in_code_units()) };
if (slice.equals_ignoring_case(needle))
return index;
index += slice.begin().length_in_code_units();
}
return {};
}
bool Utf16View::starts_with(Utf16View const& needle) const
{
if (needle.is_empty())
return true;
if (is_empty())
return false;
if (needle.length_in_code_units() > length_in_code_units())
return false;
if (begin_ptr() == needle.begin_ptr())
return true;
for (auto this_it = begin(), needle_it = needle.begin(); needle_it != needle.end(); ++needle_it, ++this_it) {
if (*this_it != *needle_it)
return false;
}
return true;
}
// https://infra.spec.whatwg.org/#code-unit-less-than
bool Utf16View::is_code_unit_less_than(Utf16View const& other) const
{
auto a = m_code_units;
auto b = other.m_code_units;
auto common_length = min(a.size(), b.size());
for (size_t position = 0; position < common_length; ++position) {
if (a[position] != b[position])
return a[position] < b[position];
}
return a.size() < b.size();
}
bool Utf16View::validate(AllowInvalidCodeUnits allow_invalid_code_units) const
{
size_t valid_code_units = 0;
return validate(valid_code_units, allow_invalid_code_units);
}
bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const
{
auto view = *this;
valid_code_units = 0;
while (!view.is_empty()) {
auto result = simdutf::validate_utf16_with_errors(view.char_data(), view.length_in_code_units());
valid_code_units += result.count;
if (result.error == simdutf::SUCCESS)
return true;
if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE)
return false;
view = view.substring_view(result.count + 1);
++valid_code_units;
}
return true;
}
size_t Utf16View::calculate_length_in_code_points() const size_t Utf16View::calculate_length_in_code_points() const
{ {
// FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement // FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement
// for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can // for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can
// remove this branch. // remove this branch.
if (validate()) [[likely]] if (validate()) [[likely]]
return simdutf::count_utf16(char_data(), length_in_code_units()); return simdutf::count_utf16(m_string, length_in_code_units());
size_t code_points = 0; size_t code_points = 0;
for ([[maybe_unused]] auto code_point : *this) for ([[maybe_unused]] auto code_point : *this)
@ -315,81 +231,4 @@ size_t Utf16View::calculate_length_in_code_points() const
return code_points; return code_points;
} }
bool Utf16View::equals_ignoring_case(Utf16View const& other) const
{
if (length_in_code_units() != other.length_in_code_units())
return false;
for (size_t i = 0; i < length_in_code_units(); ++i) {
// FIXME: Handle non-ASCII case insensitive comparisons.
if (to_ascii_lowercase(m_code_units[i]) != to_ascii_lowercase(other.m_code_units[i]))
return false;
}
return true;
}
Utf16CodePointIterator& Utf16CodePointIterator::operator++()
{
size_t code_units = length_in_code_units();
if (code_units > m_remaining_code_units) {
// If there aren't enough code units remaining, skip to the end.
m_ptr += m_remaining_code_units;
m_remaining_code_units = 0;
} else {
m_ptr += code_units;
m_remaining_code_units -= code_units;
}
return *this;
}
u32 Utf16CodePointIterator::operator*() const
{
VERIFY(m_remaining_code_units > 0);
// rfc2781, 2.2 Decoding UTF-16
// 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
// of W1. Terminate.
// 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
// is in error and no valid character can be obtained using W1.
// Terminate.
// 3) If there is no W2 (that is, the sequence ends with W1), or if W2
// is not between 0xDC00 and 0xDFFF, the sequence is in error.
// Terminate.
// 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
// bits of W1 as its 10 high-order bits and the 10 low-order bits of
// W2 as its 10 low-order bits.
// 5) Add 0x10000 to U' to obtain the character value U. Terminate.
auto code_unit = *m_ptr;
if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) {
if (m_remaining_code_units > 1) {
auto next_code_unit = *(m_ptr + 1);
if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit))
return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
}
return UnicodeUtils::REPLACEMENT_CODE_POINT;
}
if (UnicodeUtils::is_utf16_low_surrogate(code_unit))
return UnicodeUtils::REPLACEMENT_CODE_POINT;
return static_cast<u32>(code_unit);
}
bool validate_utf16_le(ReadonlyBytes bytes)
{
return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
}
bool validate_utf16_be(ReadonlyBytes bytes)
{
return simdutf::validate_utf16be(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
}
} }

View file

@ -10,6 +10,7 @@
#include <AK/Error.h> #include <AK/Error.h>
#include <AK/Format.h> #include <AK/Format.h>
#include <AK/Forward.h> #include <AK/Forward.h>
#include <AK/MemMem.h>
#include <AK/Optional.h> #include <AK/Optional.h>
#include <AK/Span.h> #include <AK/Span.h>
#include <AK/String.h> #include <AK/String.h>
@ -21,7 +22,7 @@
namespace AK { namespace AK {
using Utf16Data = Vector<u16, 1>; using Utf16Data = Vector<char16_t, 1>;
struct Utf16ConversionResult { struct Utf16ConversionResult {
Utf16Data data; Utf16Data data;
@ -36,8 +37,6 @@ ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const&);
size_t utf16_code_unit_length_from_utf8(StringView); size_t utf16_code_unit_length_from_utf8(StringView);
class Utf16View;
class Utf16CodePointIterator { class Utf16CodePointIterator {
friend class Utf16View; friend class Utf16View;
@ -45,27 +44,57 @@ public:
Utf16CodePointIterator() = default; Utf16CodePointIterator() = default;
~Utf16CodePointIterator() = default; ~Utf16CodePointIterator() = default;
bool operator==(Utf16CodePointIterator const& other) const constexpr Utf16CodePointIterator& operator++()
{ {
return (m_ptr == other.m_ptr) && (m_remaining_code_units == other.m_remaining_code_units); VERIFY(m_remaining_code_units > 0);
auto length = min(length_in_code_units(), m_remaining_code_units);
m_iterator += length;
m_remaining_code_units -= length;
return *this;
} }
Utf16CodePointIterator& operator++(); constexpr u32 operator*() const
u32 operator*() const; {
VERIFY(m_remaining_code_units > 0);
auto code_unit = *m_iterator;
size_t length_in_code_units() const if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) {
if (m_remaining_code_units > 1) {
auto next_code_unit = *(m_iterator + 1);
if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit))
return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
}
return UnicodeUtils::REPLACEMENT_CODE_POINT;
}
if (UnicodeUtils::is_utf16_low_surrogate(code_unit))
return UnicodeUtils::REPLACEMENT_CODE_POINT;
return static_cast<u32>(code_unit);
}
[[nodiscard]] constexpr bool operator==(Utf16CodePointIterator const& other) const
{
return (m_iterator == other.m_iterator) && (m_remaining_code_units == other.m_remaining_code_units);
}
[[nodiscard]] constexpr size_t length_in_code_units() const
{ {
return UnicodeUtils::code_unit_length_for_code_point(**this); return UnicodeUtils::code_unit_length_for_code_point(**this);
} }
private: private:
Utf16CodePointIterator(u16 const* ptr, size_t length) Utf16CodePointIterator(char16_t const* ptr, size_t length)
: m_ptr(ptr) : m_iterator(ptr)
, m_remaining_code_units(length) , m_remaining_code_units(length)
{ {
} }
u16 const* m_ptr { nullptr }; char16_t const* m_iterator { nullptr };
size_t m_remaining_code_units { 0 }; size_t m_remaining_code_units { 0 };
}; };
@ -73,101 +102,233 @@ class Utf16View {
public: public:
using Iterator = Utf16CodePointIterator; using Iterator = Utf16CodePointIterator;
enum class AllowInvalidCodeUnits {
No,
Yes,
};
Utf16View() = default; Utf16View() = default;
~Utf16View() = default; ~Utf16View() = default;
explicit Utf16View(ReadonlySpan<u16> code_units) constexpr Utf16View(char16_t const* string, size_t length_in_code_units)
: m_code_units(code_units) : m_string(string)
, m_length_in_code_units(length_in_code_units)
{
}
constexpr Utf16View(Utf16Data const& string)
: m_string(string.data())
, m_length_in_code_units(string.size())
{ {
} }
Utf16View(Utf16ConversionResult&&) = delete; Utf16View(Utf16ConversionResult&&) = delete;
explicit Utf16View(Utf16ConversionResult const& conversion_result) explicit Utf16View(Utf16ConversionResult const& conversion_result)
: m_code_units(conversion_result.data) : m_string(conversion_result.data.data())
, m_length_in_code_units(conversion_result.data.size())
, m_length_in_code_points(conversion_result.code_point_count) , m_length_in_code_points(conversion_result.code_point_count)
{ {
} }
template<size_t Size> ErrorOr<String> to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
Utf16View(char16_t const (&code_units)[Size]) ErrorOr<ByteString> to_byte_string(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
: m_code_units(
reinterpret_cast<u16 const*>(&code_units[0]), [[nodiscard]] constexpr ReadonlySpan<char16_t> span() const
code_units[Size - 1] == u'\0' ? Size - 1 : Size)
{ {
return { m_string, length_in_code_units() };
} }
bool operator==(Utf16View const& other) const { return m_code_units == other.m_code_units; } [[nodiscard]] constexpr bool operator==(Utf16View const& other) const
{
if (length_in_code_units() != other.length_in_code_units())
return false;
return TypedTransfer<char16_t>::compare(m_string, other.m_string, length_in_code_units());
}
enum class AllowInvalidCodeUnits { [[nodiscard]] constexpr bool equals_ignoring_case(Utf16View const& other) const
Yes, {
No, // FIXME: Handle non-ASCII case insensitive comparisons.
}; return equals_ignoring_ascii_case(other);
}
ErrorOr<ByteString> to_byte_string(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; [[nodiscard]] constexpr bool equals_ignoring_ascii_case(Utf16View const& other) const
ErrorOr<String> to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; {
if (length_in_code_units() != other.length_in_code_units())
return false;
void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; } for (size_t i = 0; i < length_in_code_units(); ++i) {
if (to_ascii_lowercase(code_unit_at(i)) != to_ascii_lowercase(other.code_unit_at(i)))
return false;
}
bool is_null() const { return m_code_units.is_null(); } return true;
bool is_empty() const { return m_code_units.is_empty(); } }
bool is_ascii() const;
size_t length_in_code_units() const { return m_code_units.size(); } template<typename... Ts>
size_t length_in_code_points() const; [[nodiscard]] constexpr bool is_one_of(Ts&&... strings) const
{
return (this->operator==(forward<Ts>(strings)) || ...);
}
Optional<size_t> length_in_code_points_if_known() const template<typename... Ts>
[[nodiscard]] constexpr bool is_one_of_ignoring_ascii_case(Ts&&... strings) const
{
return (this->equals_ignoring_ascii_case(forward<Ts>(strings)) || ...);
}
[[nodiscard]] constexpr u32 hash() const
{
if (is_empty())
return 0;
return string_hash(reinterpret_cast<char const*>(m_string), length_in_code_units() * sizeof(char16_t));
}
[[nodiscard]] constexpr bool is_null() const { return m_string == nullptr; }
[[nodiscard]] constexpr bool is_empty() const { return length_in_code_units() == 0; }
[[nodiscard]] bool is_ascii() const;
[[nodiscard]] ALWAYS_INLINE bool validate(AllowInvalidCodeUnits allow_invalid_code_units = AllowInvalidCodeUnits::No) const
{
size_t valid_code_units = 0;
return validate(valid_code_units, allow_invalid_code_units);
}
[[nodiscard]] bool validate(size_t& valid_code_units, AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
[[nodiscard]] constexpr size_t length_in_code_units() const { return m_length_in_code_units; }
[[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const
{
if (m_length_in_code_points == NumericLimits<size_t>::max())
m_length_in_code_points = calculate_length_in_code_points();
return m_length_in_code_points;
}
constexpr Optional<size_t> length_in_code_points_if_known() const
{ {
if (m_length_in_code_points == NumericLimits<size_t>::max()) if (m_length_in_code_points == NumericLimits<size_t>::max())
return {}; return {};
return m_length_in_code_points; return m_length_in_code_points;
} }
u32 hash() const constexpr void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; }
[[nodiscard]] constexpr char16_t code_unit_at(size_t index) const
{ {
if (is_empty()) VERIFY(index < length_in_code_units());
return 0; return m_string[index];
return string_hash(reinterpret_cast<char const*>(m_code_units.data()), m_code_units.size() * sizeof(u16));
} }
Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size() }; } [[nodiscard]] constexpr u32 code_point_at(size_t index) const
Utf16CodePointIterator end() const { return { end_ptr(), 0 }; } {
VERIFY(index < length_in_code_units());
u32 code_point = code_unit_at(index);
u16 const* data() const { return m_code_units.data(); } if (!UnicodeUtils::is_utf16_high_surrogate(code_point) && !UnicodeUtils::is_utf16_low_surrogate(code_point))
char16_t const* char_data() const { return reinterpret_cast<char16_t const*>(data()); } return code_point;
if (UnicodeUtils::is_utf16_low_surrogate(code_point) || (index + 1 == length_in_code_units()))
return code_point;
ReadonlySpan<u16> span() const { return m_code_units; } auto second = code_unit_at(index + 1);
if (!UnicodeUtils::is_utf16_low_surrogate(second))
return code_point;
u16 code_unit_at(size_t index) const; return UnicodeUtils::decode_utf16_surrogate_pair(code_point, second);
u32 code_point_at(size_t index) const; }
size_t code_point_offset_of(size_t code_unit_offset) const; [[nodiscard]] size_t code_unit_offset_of(size_t code_point_offset) const;
size_t code_unit_offset_of(size_t code_point_offset) const; [[nodiscard]] size_t code_point_offset_of(size_t code_unit_offset) const;
size_t code_unit_offset_of(Utf16CodePointIterator const&) const;
Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const; [[nodiscard]] constexpr Utf16CodePointIterator begin() const
Utf16View substring_view(size_t code_unit_offset) const { return substring_view(code_unit_offset, length_in_code_units() - code_unit_offset); } {
return { m_string, length_in_code_units() };
}
Utf16View unicode_substring_view(size_t code_point_offset, size_t code_point_length) const; [[nodiscard]] constexpr Utf16CodePointIterator end() const
Utf16View unicode_substring_view(size_t code_point_offset) const { return unicode_substring_view(code_point_offset, length_in_code_points() - code_point_offset); } {
return { m_string + length_in_code_units(), 0 };
}
Optional<size_t> find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const; [[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const
Optional<size_t> find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset = 0) const; {
VERIFY(code_unit_offset + code_unit_length <= length_in_code_units());
return { m_string + code_unit_offset, code_unit_length };
}
bool starts_with(Utf16View const&) const; [[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset) const { return substring_view(code_unit_offset, length_in_code_units() - code_unit_offset); }
bool is_code_unit_less_than(Utf16View const& other) const;
bool validate(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; [[nodiscard]] Utf16View unicode_substring_view(size_t code_point_offset, size_t code_point_length) const;
bool validate(size_t& valid_code_units, AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; [[nodiscard]] Utf16View unicode_substring_view(size_t code_point_offset) const { return unicode_substring_view(code_point_offset, length_in_code_points() - code_point_offset); }
// find_code_unit_offset(char16_t): searches for a single code unit beginning at `start_offset`.
// Yields an empty Optional when `start_offset` is at or past the end of the view.
//
// NOTE(review): the search is handed to AK::memmem_optional with a haystack length expressed in
// bytes (remaining code units * sizeof(char16_t)) and a haystack pointer of m_string + start_offset.
// The value returned is therefore presumably a BYTE offset measured from the search start, not a
// code unit offset into this view. The regex line-splitting caller divides the result by
// sizeof(u16), which points the same way. This differs from the Utf16View overload, which forwards
// `start_offset` to span().index_of(). Confirm the intended units before adding callers, and
// confirm that a bytewise search cannot report a match at an odd byte position that straddles
// two adjacent code units.
bool equals_ignoring_case(Utf16View const&) const; constexpr Optional<size_t> find_code_unit_offset(char16_t needle, size_t start_offset = 0) const
{
// Nothing left to search once the starting position reaches the end.
if (start_offset >= length_in_code_units())
return {};
return AK::memmem_optional(m_string + start_offset, (length_in_code_units() - start_offset) * sizeof(char16_t), &needle, sizeof(needle));
}
// Searches this view's code units for `needle`, beginning at `start_offset`, by forwarding to
// index_of() on the underlying span. The result is whatever index_of() reports.
constexpr Optional<size_t> find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const
{
    auto const haystack_code_units = span();
    auto const needle_code_units = needle.span();
    return haystack_code_units.index_of(needle_code_units, start_offset);
}
// Case-insensitive counterpart of find_code_unit_offset(): returns the code unit offset of the
// first window (at or after `start_offset`) that equals `needle` under equals_ignoring_case(),
// or an empty Optional when there is none.
constexpr Optional<size_t> find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset = 0) const
{
    auto const needle_length = needle.length_in_code_units();
    auto const haystack_length = length_in_code_units();

    // The first candidate window must fit inside the view; the addition is overflow-checked.
    Checked first_window_end { start_offset };
    first_window_end += needle_length;
    if (first_window_end.has_overflow() || first_window_end.value() > haystack_length)
        return {};

    // An empty needle matches immediately at the starting position.
    if (needle.is_empty())
        return start_offset;

    // Safe subtraction: the check above guarantees needle_length <= haystack_length.
    auto const last_candidate = haystack_length - needle_length;

    for (size_t candidate = start_offset; candidate <= last_candidate;) {
        auto window = substring_view(candidate, needle_length);
        if (window.equals_ignoring_case(needle))
            return candidate;

        // Step by the size, in code units, reported by the window's first iterator position.
        candidate += window.begin().length_in_code_units();
    }

    return {};
}
// Reports whether this view begins with the code units of `needle`.
[[nodiscard]] constexpr bool starts_with(Utf16View const& needle) const
{
    // An empty needle is a prefix of every view, including an empty one.
    if (needle.is_empty())
        return true;

    // A non-empty needle cannot prefix an empty view or one shorter than itself.
    auto const needle_cannot_fit = is_empty() || needle.length_in_code_units() > length_in_code_units();
    if (needle_cannot_fit)
        return false;

    // Same start pointer and a needle no longer than this view: no comparison needed.
    // Otherwise compare the code units through the spans.
    return m_string == needle.m_string || span().starts_with(needle.span());
}
// https://infra.spec.whatwg.org/#code-unit-less-than
// Orders two views by their code units: the first differing code unit decides; if one view is a
// prefix of the other, the shorter one is the lesser.
[[nodiscard]] constexpr bool is_code_unit_less_than(Utf16View const& other) const
{
    auto const own_length = length_in_code_units();
    auto const other_length = other.length_in_code_units();
    auto const shared_length = min(own_length, other_length);

    for (size_t i = 0; i < shared_length; ++i) {
        auto const lhs = code_unit_at(i);
        auto const rhs = other.code_unit_at(i);
        if (lhs == rhs)
            continue;
        return lhs < rhs;
    }

    // All shared positions are equal, so only the lengths can differ.
    return own_length < other_length;
}
private: private:
u16 const* begin_ptr() const { return m_code_units.data(); } [[nodiscard]] size_t calculate_length_in_code_points() const;
u16 const* end_ptr() const { return begin_ptr() + m_code_units.size(); }
size_t calculate_length_in_code_points() const; char16_t const* m_string { nullptr };
size_t m_length_in_code_units { 0 };
ReadonlySpan<u16> m_code_units;
mutable size_t m_length_in_code_points { NumericLimits<size_t>::max() }; mutable size_t m_length_in_code_points { NumericLimits<size_t>::max() };
}; };
@ -188,6 +349,13 @@ struct Traits<Utf16View> : public DefaultTraits<Utf16View> {
} }
// User-defined literal: `u"..."sv` yields an AK::Utf16View over the literal's code units.
// The view is checked with validate() under ASSERT before it is returned.
// NOTE(review): tests in this change build literals containing lone surrogates (for example
// u"\xd800"sv); confirm that ASSERT is inactive, or otherwise tolerant, wherever such literals
// are constructed.
[[nodiscard]] ALWAYS_INLINE AK_STRING_VIEW_LITERAL_CONSTEVAL AK::Utf16View operator""sv(char16_t const* string, size_t length)
{
    auto literal_view = AK::Utf16View { string, length };
    ASSERT(literal_view.validate());
    return literal_view;
}
#if USING_AK_GLOBALLY #if USING_AK_GLOBALLY
using AK::Utf16Data; using AK::Utf16Data;
using AK::Utf16View; using AK::Utf16View;

View file

@ -111,7 +111,7 @@ ErrorOr<String> Process::get_name()
if (!length) if (!length)
return Error::from_windows_error(); return Error::from_windows_error();
return String::from_utf16(Utf16View { { (u16*)path, length } }); return String::from_utf16(Utf16View { reinterpret_cast<char16_t const*>(path), length });
} }
ErrorOr<void> Process::set_name(StringView, SetThreadName) ErrorOr<void> Process::set_name(StringView, SetThreadName)

View file

@ -1271,33 +1271,33 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
Optional<Utf16String> capture_string; Optional<Utf16String> capture_string;
// b. If templateRemainder starts with "$$", then // b. If templateRemainder starts with "$$", then
if (template_remainder.starts_with(u"$$")) { if (template_remainder.starts_with(u"$$"sv)) {
// i. Let ref be "$$". // i. Let ref be "$$".
ref = u"$$"; ref = u"$$"sv;
// ii. Let refReplacement be "$". // ii. Let refReplacement be "$".
ref_replacement = u"$"; ref_replacement = u"$"sv;
} }
// c. Else if templateRemainder starts with "$`", then // c. Else if templateRemainder starts with "$`", then
else if (template_remainder.starts_with(u"$`")) { else if (template_remainder.starts_with(u"$`"sv)) {
// i. Let ref be "$`". // i. Let ref be "$`".
ref = u"$`"; ref = u"$`"sv;
// ii. Let refReplacement be the substring of str from 0 to position. // ii. Let refReplacement be the substring of str from 0 to position.
ref_replacement = str.substring_view(0, position); ref_replacement = str.substring_view(0, position);
} }
// d. Else if templateRemainder starts with "$&", then // d. Else if templateRemainder starts with "$&", then
else if (template_remainder.starts_with(u"$&")) { else if (template_remainder.starts_with(u"$&"sv)) {
// i. Let ref be "$&". // i. Let ref be "$&".
ref = u"$&"; ref = u"$&"sv;
// ii. Let refReplacement be matched. // ii. Let refReplacement be matched.
ref_replacement = matched; ref_replacement = matched;
} }
// e. Else if templateRemainder starts with "$'" (0x0024 (DOLLAR SIGN) followed by 0x0027 (APOSTROPHE)), then // e. Else if templateRemainder starts with "$'" (0x0024 (DOLLAR SIGN) followed by 0x0027 (APOSTROPHE)), then
else if (template_remainder.starts_with(u"$'")) { else if (template_remainder.starts_with(u"$'"sv)) {
// i. Let ref be "$'". // i. Let ref be "$'".
ref = u"$'"; ref = u"$'"sv;
// ii. Let matchLength be the length of matched. // ii. Let matchLength be the length of matched.
auto match_length = matched.length_in_code_units(); auto match_length = matched.length_in_code_units();
@ -1311,7 +1311,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
// v. NOTE: tailPos can exceed stringLength only if this abstract operation was invoked by a call to the intrinsic @@replace method of %RegExp.prototype% on an object whose "exec" property is not the intrinsic %RegExp.prototype.exec%. // v. NOTE: tailPos can exceed stringLength only if this abstract operation was invoked by a call to the intrinsic @@replace method of %RegExp.prototype% on an object whose "exec" property is not the intrinsic %RegExp.prototype.exec%.
} }
// f. Else if templateRemainder starts with "$" followed by 1 or more decimal digits, then // f. Else if templateRemainder starts with "$" followed by 1 or more decimal digits, then
else if (template_remainder.starts_with(u"$") && template_remainder.length_in_code_units() > 1 && is_ascii_digit(template_remainder.code_unit_at(1))) { else if (template_remainder.starts_with(u"$"sv) && template_remainder.length_in_code_units() > 1 && is_ascii_digit(template_remainder.code_unit_at(1))) {
// i. If templateRemainder starts with "$" followed by 2 or more decimal digits, let digitCount be 2. Otherwise, let digitCount be 1. // i. If templateRemainder starts with "$" followed by 2 or more decimal digits, let digitCount be 2. Otherwise, let digitCount be 1.
size_t digit_count = 1; size_t digit_count = 1;
@ -1373,15 +1373,15 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
} }
} }
// g. Else if templateRemainder starts with "$<", then // g. Else if templateRemainder starts with "$<", then
else if (template_remainder.starts_with(u"$<")) { else if (template_remainder.starts_with(u"$<"sv)) {
// i. Let gtPos be StringIndexOf(templateRemainder, ">", 0). // i. Let gtPos be StringIndexOf(templateRemainder, ">", 0).
// NOTE: We can actually start at index 2 because we know the string starts with "$<". // NOTE: We can actually start at index 2 because we know the string starts with "$<".
auto greater_than_position = string_index_of(template_remainder, u">", 2); auto greater_than_position = string_index_of(template_remainder, u">"sv, 2);
// ii. If gtPos = -1 or namedCaptures is undefined, then // ii. If gtPos = -1 or namedCaptures is undefined, then
if (!greater_than_position.has_value() || named_captures.is_undefined()) { if (!greater_than_position.has_value() || named_captures.is_undefined()) {
// 1. Let ref be "$<". // 1. Let ref be "$<".
ref = u"$<"; ref = u"$<"sv;
// 2. Let refReplacement be ref. // 2. Let refReplacement be ref.
ref_replacement = ref; ref_replacement = ref;
@ -1427,7 +1427,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
auto ref_length = ref.length_in_code_units(); auto ref_length = ref.length_in_code_units();
// k. Set result to the string-concatenation of result and refReplacement. // k. Set result to the string-concatenation of result and refReplacement.
result.append(ref_replacement.data(), ref_replacement.length_in_code_points()); result.append(ref_replacement.span().data(), ref_replacement.length_in_code_units());
// j. Set templateRemainder to the substring of templateRemainder from refLength. // j. Set templateRemainder to the substring of templateRemainder from refLength.
// NOTE: We do this step last because refReplacement may point to templateRemainder. // NOTE: We do this step last because refReplacement may point to templateRemainder.

View file

@ -44,7 +44,7 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
{ {
Utf16Data string; Utf16Data string;
string.ensure_capacity(view.length_in_code_units()); string.ensure_capacity(view.length_in_code_units());
string.unchecked_append(view.data(), view.length_in_code_units()); string.unchecked_append(view.span().data(), view.length_in_code_units());
auto impl = create(move(string)); auto impl = create(move(string));
if (auto length_in_code_points = view.length_in_code_points_if_known(); length_in_code_points.has_value()) if (auto length_in_code_points = view.length_in_code_points_if_known(); length_in_code_points.has_value())

View file

@ -48,7 +48,7 @@ private:
mutable bool m_has_hash { false }; mutable bool m_has_hash { false };
mutable u32 m_hash { 0 }; mutable u32 m_hash { 0 };
Utf16Data m_string; Utf16Data m_string;
Utf16View m_cached_view { m_string.span() }; Utf16View m_cached_view { m_string };
}; };
} }

View file

@ -147,9 +147,8 @@ public:
return Vector<RegexStringView> { view }; return Vector<RegexStringView> { view };
Vector<RegexStringView> views; Vector<RegexStringView> views;
u16 newline = '\n';
while (!view.is_empty()) { while (!view.is_empty()) {
auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16)); auto position = view.find_code_unit_offset(u'\n');
if (!position.has_value()) if (!position.has_value())
break; break;
auto offset = position.value() / sizeof(u16); auto offset = position.value() / sizeof(u16);

View file

@ -159,8 +159,7 @@ String icu_string_to_string(icu::UnicodeString const& string)
String icu_string_to_string(UChar const* string, i32 length) String icu_string_to_string(UChar const* string, i32 length)
{ {
ReadonlySpan<u16> view { reinterpret_cast<u16 const*>(string), static_cast<size_t>(length) }; return MUST(Utf16View { string, static_cast<size_t>(length) }.to_utf8());
return MUST(Utf16View { view }.to_utf8());
} }
} }

View file

@ -75,7 +75,7 @@ public:
virtual void set_segmented_text(Utf16View const& text) override virtual void set_segmented_text(Utf16View const& text) override
{ {
m_segmented_text = icu::UnicodeString { text.data(), static_cast<i32>(text.length_in_code_units()) }; m_segmented_text = icu::UnicodeString { text.span().data(), static_cast<i32>(text.length_in_code_units()) };
m_segmenter->setText(m_segmented_text.get<icu::UnicodeString>()); m_segmenter->setText(m_segmented_text.get<icu::UnicodeString>());
} }

View file

@ -86,11 +86,12 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
auto before_data = utf16_view.substring_view(0, offset); auto before_data = utf16_view.substring_view(0, offset);
auto inserted_data_result = MUST(AK::utf8_to_utf16(data)); auto inserted_data_result = MUST(AK::utf8_to_utf16(data));
auto after_data = utf16_view.substring_view(offset + count); auto after_data = utf16_view.substring_view(offset + count);
Utf16Data full_data; Utf16Data full_data;
full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units()); full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
full_data.append(before_data.data(), before_data.length_in_code_units()); full_data.append(before_data.span().data(), before_data.length_in_code_units());
full_data.extend(inserted_data_result.data); full_data.extend(inserted_data_result.data);
full_data.append(after_data.data(), after_data.length_in_code_units()); full_data.append(after_data.span().data(), after_data.length_in_code_units());
Utf16View full_view { full_data }; Utf16View full_view { full_data };
bool characters_are_the_same = utf16_view == full_view; bool characters_are_the_same = utf16_view == full_view;

View file

@ -106,7 +106,7 @@ WebIDL::ExceptionOr<FileReader::Result> FileReader::blob_package_data(JS::Realm&
return JS::ArrayBuffer::create(realm, move(bytes)); return JS::ArrayBuffer::create(realm, move(bytes));
case Type::BinaryString: case Type::BinaryString:
// Return bytes as a binary string, in which every byte is represented by a code unit of equal value [0..255]. // Return bytes as a binary string, in which every byte is represented by a code unit of equal value [0..255].
Vector<u16> builder; Utf16Data builder;
builder.ensure_capacity(bytes.size()); builder.ensure_capacity(bytes.size());
for (auto byte : bytes.bytes()) for (auto byte : bytes.bytes())
builder.unchecked_append(byte); builder.unchecked_append(byte);

View file

@ -1,5 +1,5 @@
/* /*
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org> * Copyright (c) 2021-2025, Tim Flynn <trflynn89@ladybird.org>
* *
* SPDX-License-Identifier: BSD-2-Clause * SPDX-License-Identifier: BSD-2-Clause
*/ */
@ -60,8 +60,7 @@ TEST_CASE(encode_utf8)
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), utf8_string); EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), utf8_string);
} }
{ {
auto encoded = Array { (u16)0xd83d }; Utf16View view { u"\xd83d"sv };
Utf16View view { encoded };
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv); EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv);
EXPECT(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No).is_error()); EXPECT(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No).is_error());
} }
@ -69,11 +68,8 @@ TEST_CASE(encode_utf8)
TEST_CASE(decode_utf16) TEST_CASE(decode_utf16)
{ {
// Same string as the decode_utf8 test. Utf16View view { u"Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv };
auto encoded = Array { (u16)0x041f, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, 0x002c, 0x0020, 0x043c, 0x0438, 0x0440, 0x0021, 0x0020, 0xd83d, 0xde00, 0x0020, 0x03b3, 0x03b5, 0x03b9, 0x03ac, 0x0020, 0x03c3, 0x03bf, 0x03c5, 0x0020, 0x03ba, 0x03cc, 0x03c3, 0x03bc, 0x03bf, 0x03c2, 0x0020, 0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c }; EXPECT_EQ(view.length_in_code_units(), 39uz);
Utf16View view { encoded };
EXPECT_EQ(encoded.size(), view.length_in_code_units());
size_t valid_code_units = 0; size_t valid_code_units = 0;
EXPECT(view.validate(valid_code_units)); EXPECT(view.validate(valid_code_units));
@ -113,18 +109,18 @@ TEST_CASE(null_view)
TEST_CASE(utf16_literal) TEST_CASE(utf16_literal)
{ {
{ {
Utf16View view { u"" }; Utf16View view { u""sv };
EXPECT(view.validate()); EXPECT(view.validate());
EXPECT_EQ(view.length_in_code_units(), 0u); EXPECT_EQ(view.length_in_code_units(), 0u);
} }
{ {
Utf16View view { u"a" }; Utf16View view { u"a"sv };
EXPECT(view.validate()); EXPECT(view.validate());
EXPECT_EQ(view.length_in_code_units(), 1u); EXPECT_EQ(view.length_in_code_units(), 1u);
EXPECT_EQ(view.code_unit_at(0), 0x61u); EXPECT_EQ(view.code_unit_at(0), 0x61u);
} }
{ {
Utf16View view { u"abc" }; Utf16View view { u"abc"sv };
EXPECT(view.validate()); EXPECT(view.validate());
EXPECT_EQ(view.length_in_code_units(), 3u); EXPECT_EQ(view.length_in_code_units(), 3u);
EXPECT_EQ(view.code_unit_at(0), 0x61u); EXPECT_EQ(view.code_unit_at(0), 0x61u);
@ -132,7 +128,7 @@ TEST_CASE(utf16_literal)
EXPECT_EQ(view.code_unit_at(2), 0x63u); EXPECT_EQ(view.code_unit_at(2), 0x63u);
} }
{ {
Utf16View view { u"🙃" }; Utf16View view { u"🙃"sv };
EXPECT(view.validate()); EXPECT(view.validate());
EXPECT_EQ(view.length_in_code_units(), 2u); EXPECT_EQ(view.length_in_code_units(), 2u);
EXPECT_EQ(view.code_unit_at(0), 0xd83du); EXPECT_EQ(view.code_unit_at(0), 0xd83du);
@ -190,14 +186,14 @@ TEST_CASE(validate_invalid_utf16)
Utf16View invalid; Utf16View invalid;
{ {
// Lonely high surrogate. // Lonely high surrogate.
invalid = u"\xd800"; invalid = u"\xd800"sv;
EXPECT(!invalid.validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz); EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT_EQ(valid_code_units, 1uz); EXPECT_EQ(valid_code_units, 1uz);
invalid = u"\xdbff"; invalid = u"\xdbff"sv;
EXPECT(!invalid.validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz); EXPECT_EQ(valid_code_units, 0uz);
@ -206,14 +202,14 @@ TEST_CASE(validate_invalid_utf16)
} }
{ {
// Lonely low surrogate. // Lonely low surrogate.
invalid = u"\xdc00"; invalid = u"\xdc00"sv;
EXPECT(!invalid.validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz); EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT_EQ(valid_code_units, 1uz); EXPECT_EQ(valid_code_units, 1uz);
invalid = u"\xdfff"; invalid = u"\xdfff"sv;
EXPECT(!invalid.validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz); EXPECT_EQ(valid_code_units, 0uz);
@ -222,14 +218,14 @@ TEST_CASE(validate_invalid_utf16)
} }
{ {
// High surrogate followed by non-surrogate. // High surrogate followed by non-surrogate.
invalid = u"\xd800\x0000"; invalid = u"\xd800\x0000"sv;
EXPECT(!invalid.validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz); EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT_EQ(valid_code_units, 2uz); EXPECT_EQ(valid_code_units, 2uz);
invalid = u"\xd800\xe000"; invalid = u"\xd800\xe000"sv;
EXPECT(!invalid.validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz); EXPECT_EQ(valid_code_units, 0uz);
@ -238,14 +234,14 @@ TEST_CASE(validate_invalid_utf16)
} }
{ {
// High surrogate followed by high surrogate. // High surrogate followed by high surrogate.
invalid = u"\xd800\xd800"; invalid = u"\xd800\xd800"sv;
EXPECT(!invalid.validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz); EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT_EQ(valid_code_units, 2uz); EXPECT_EQ(valid_code_units, 2uz);
invalid = u"\xd800\xdbff"; invalid = u"\xd800\xdbff"sv;
EXPECT(!invalid.validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz); EXPECT_EQ(valid_code_units, 0uz);
@ -254,14 +250,14 @@ TEST_CASE(validate_invalid_utf16)
} }
{ {
// Valid UTF-16 followed by invalid code units. // Valid UTF-16 followed by invalid code units.
invalid = u"\x0041\x0041\xd800"; invalid = u"\x0041\x0041\xd800"sv;
EXPECT(!invalid.validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 2uz); EXPECT_EQ(valid_code_units, 2uz);
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT_EQ(valid_code_units, 3uz); EXPECT_EQ(valid_code_units, 3uz);
invalid = u"\x0041\x0041\xd800"; invalid = u"\x0041\x0041\xd800"sv;
EXPECT(!invalid.validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 2uz); EXPECT_EQ(valid_code_units, 2uz);
@ -274,10 +270,8 @@ TEST_CASE(decode_invalid_utf16)
{ {
{ {
// Lonely high surrogate. // Lonely high surrogate.
auto invalid = Array { (u16)0x41, 0x42, 0xd800 }; Utf16View view { u"AB\xd800"sv };
EXPECT_EQ(view.length_in_code_units(), 3uz);
Utf16View view { invalid };
EXPECT_EQ(invalid.size(), view.length_in_code_units());
auto expected = Array { (u32)0x41, 0x42, 0xfffd }; auto expected = Array { (u32)0x41, 0x42, 0xfffd };
EXPECT_EQ(expected.size(), view.length_in_code_points()); EXPECT_EQ(expected.size(), view.length_in_code_points());
@ -290,10 +284,8 @@ TEST_CASE(decode_invalid_utf16)
} }
{ {
// Lonely low surrogate. // Lonely low surrogate.
auto invalid = Array { (u16)0x41, 0x42, 0xdc00 }; Utf16View view { u"AB\xdc00"sv };
EXPECT_EQ(view.length_in_code_units(), 3uz);
Utf16View view { invalid };
EXPECT_EQ(invalid.size(), view.length_in_code_units());
auto expected = Array { (u32)0x41, 0x42, 0xfffd }; auto expected = Array { (u32)0x41, 0x42, 0xfffd };
EXPECT_EQ(expected.size(), view.length_in_code_points()); EXPECT_EQ(expected.size(), view.length_in_code_points());
@ -306,10 +298,8 @@ TEST_CASE(decode_invalid_utf16)
} }
{ {
// High surrogate followed by non-surrogate. // High surrogate followed by non-surrogate.
auto invalid = Array { (u16)0x41, 0x42, 0xd800, 0 }; Utf16View view { u"AB\xd800\x0000"sv };
EXPECT_EQ(view.length_in_code_units(), 4uz);
Utf16View view { invalid };
EXPECT_EQ(invalid.size(), view.length_in_code_units());
auto expected = Array { (u32)0x41, 0x42, 0xfffd, 0 }; auto expected = Array { (u32)0x41, 0x42, 0xfffd, 0 };
EXPECT_EQ(expected.size(), view.length_in_code_points()); EXPECT_EQ(expected.size(), view.length_in_code_points());
@ -322,10 +312,8 @@ TEST_CASE(decode_invalid_utf16)
} }
{ {
// High surrogate followed by high surrogate. // High surrogate followed by high surrogate.
auto invalid = Array { (u16)0x41, 0x42, 0xd800, 0xd800 }; Utf16View view { u"AB\xd800\xd800"sv };
EXPECT_EQ(view.length_in_code_units(), 4uz);
Utf16View view { invalid };
EXPECT_EQ(invalid.size(), view.length_in_code_units());
auto expected = Array { (u32)0x41, 0x42, 0xfffd, 0xfffd }; auto expected = Array { (u32)0x41, 0x42, 0xfffd, 0xfffd };
EXPECT_EQ(expected.size(), view.length_in_code_points()); EXPECT_EQ(expected.size(), view.length_in_code_points());
@ -341,13 +329,13 @@ TEST_CASE(decode_invalid_utf16)
TEST_CASE(is_ascii) TEST_CASE(is_ascii)
{ {
EXPECT(Utf16View {}.is_ascii()); EXPECT(Utf16View {}.is_ascii());
EXPECT(Utf16View { u"a" }.is_ascii()); EXPECT(u"a"sv.is_ascii());
EXPECT(Utf16View { u"foo" }.is_ascii()); EXPECT(u"foo"sv.is_ascii());
EXPECT(Utf16View { u"foo\t\n\rbar\v\b123" }.is_ascii()); EXPECT(u"foo\t\n\rbar\v\b123"sv.is_ascii());
EXPECT(!Utf16View { u"😀" }.is_ascii()); EXPECT(!u"😀"sv.is_ascii());
EXPECT(!Utf16View { u"foo 😀" }.is_ascii()); EXPECT(!u"foo 😀"sv.is_ascii());
EXPECT(!Utf16View { u"😀 foo" }.is_ascii()); EXPECT(!u"😀 foo"sv.is_ascii());
} }
TEST_CASE(equals_ignoring_case) TEST_CASE(equals_ignoring_case)
@ -387,28 +375,28 @@ TEST_CASE(substring_view)
TEST_CASE(starts_with) TEST_CASE(starts_with)
{ {
EXPECT(Utf16View {}.starts_with(u"")); EXPECT(Utf16View {}.starts_with(u""sv));
EXPECT(!Utf16View {}.starts_with(u" ")); EXPECT(!Utf16View {}.starts_with(u" "sv));
EXPECT(Utf16View { u"a" }.starts_with(u"")); EXPECT(u"a"sv.starts_with(u""sv));
EXPECT(Utf16View { u"a" }.starts_with(u"a")); EXPECT(u"a"sv.starts_with(u"a"sv));
EXPECT(!Utf16View { u"a" }.starts_with(u"b")); EXPECT(!u"a"sv.starts_with(u"b"sv));
EXPECT(!Utf16View { u"a" }.starts_with(u"ab")); EXPECT(!u"a"sv.starts_with(u"ab"sv));
EXPECT(Utf16View { u"abc" }.starts_with(u"")); EXPECT(u"abc"sv.starts_with(u""sv));
EXPECT(Utf16View { u"abc" }.starts_with(u"a")); EXPECT(u"abc"sv.starts_with(u"a"sv));
EXPECT(Utf16View { u"abc" }.starts_with(u"ab")); EXPECT(u"abc"sv.starts_with(u"ab"sv));
EXPECT(Utf16View { u"abc" }.starts_with(u"abc")); EXPECT(u"abc"sv.starts_with(u"abc"sv));
EXPECT(!Utf16View { u"abc" }.starts_with(u"b")); EXPECT(!u"abc"sv.starts_with(u"b"sv));
EXPECT(!Utf16View { u"abc" }.starts_with(u"bc")); EXPECT(!u"abc"sv.starts_with(u"bc"sv));
auto emoji = Utf16View { u"😀🙃" }; auto emoji = u"😀🙃"sv;
EXPECT(emoji.starts_with(u"")); EXPECT(emoji.starts_with(u""sv));
EXPECT(emoji.starts_with(u"😀")); EXPECT(emoji.starts_with(u"😀"sv));
EXPECT(emoji.starts_with(u"😀🙃")); EXPECT(emoji.starts_with(u"😀🙃"sv));
EXPECT(!emoji.starts_with(u"a")); EXPECT(!emoji.starts_with(u"a"sv));
EXPECT(!emoji.starts_with(u"🙃")); EXPECT(!emoji.starts_with(u"🙃"sv));
} }
TEST_CASE(find_code_unit_offset) TEST_CASE(find_code_unit_offset)
@ -416,16 +404,16 @@ TEST_CASE(find_code_unit_offset)
auto conversion_result = MUST(AK::utf8_to_utf16("😀foo😀bar"sv)); auto conversion_result = MUST(AK::utf8_to_utf16("😀foo😀bar"sv));
Utf16View const view { conversion_result }; Utf16View const view { conversion_result };
EXPECT_EQ(0u, view.find_code_unit_offset(u"").value()); EXPECT_EQ(0u, view.find_code_unit_offset(u""sv).value());
EXPECT_EQ(4u, view.find_code_unit_offset(u"", 4).value()); EXPECT_EQ(4u, view.find_code_unit_offset(u""sv, 4).value());
EXPECT(!view.find_code_unit_offset(u"", 16).has_value()); EXPECT(!view.find_code_unit_offset(u""sv, 16).has_value());
EXPECT_EQ(0u, view.find_code_unit_offset(u"😀").value()); EXPECT_EQ(0u, view.find_code_unit_offset(u"😀"sv).value());
EXPECT_EQ(5u, view.find_code_unit_offset(u"😀", 1).value()); EXPECT_EQ(5u, view.find_code_unit_offset(u"😀"sv, 1).value());
EXPECT_EQ(2u, view.find_code_unit_offset(u"foo").value()); EXPECT_EQ(2u, view.find_code_unit_offset(u"foo"sv).value());
EXPECT_EQ(7u, view.find_code_unit_offset(u"bar").value()); EXPECT_EQ(7u, view.find_code_unit_offset(u"bar"sv).value());
EXPECT(!view.find_code_unit_offset(u"baz").has_value()); EXPECT(!view.find_code_unit_offset(u"baz"sv).has_value());
} }
TEST_CASE(find_code_unit_offset_ignoring_case) TEST_CASE(find_code_unit_offset_ignoring_case)
@ -433,13 +421,13 @@ TEST_CASE(find_code_unit_offset_ignoring_case)
auto conversion_result = MUST(AK::utf8_to_utf16("😀Foo😀Bar"sv)); auto conversion_result = MUST(AK::utf8_to_utf16("😀Foo😀Bar"sv));
Utf16View const view { conversion_result }; Utf16View const view { conversion_result };
EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u"").value()); EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u""sv).value());
EXPECT_EQ(4u, view.find_code_unit_offset_ignoring_case(u"", 4).value()); EXPECT_EQ(4u, view.find_code_unit_offset_ignoring_case(u""sv, 4).value());
EXPECT(!view.find_code_unit_offset_ignoring_case(u"", 16).has_value()); EXPECT(!view.find_code_unit_offset_ignoring_case(u""sv, 16).has_value());
EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u"😀").value()); EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u"😀"sv).value());
EXPECT_EQ(5u, view.find_code_unit_offset_ignoring_case(u"😀", 1).value()); EXPECT_EQ(5u, view.find_code_unit_offset_ignoring_case(u"😀"sv, 1).value());
EXPECT_EQ(2u, view.find_code_unit_offset_ignoring_case(u"foO").value()); EXPECT_EQ(2u, view.find_code_unit_offset_ignoring_case(u"foO"sv).value());
EXPECT_EQ(7u, view.find_code_unit_offset_ignoring_case(u"baR").value()); EXPECT_EQ(7u, view.find_code_unit_offset_ignoring_case(u"baR"sv).value());
EXPECT(!view.find_code_unit_offset_ignoring_case(u"baz").has_value()); EXPECT(!view.find_code_unit_offset_ignoring_case(u"baz"sv).has_value());
} }