mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-07-29 04:09:13 +00:00
AK+Everywhere: Prepare Utf16View for integration with a UTF-16 string
To prepare for an upcoming Utf16String, this migrates Utf16View to store its data as a char16_t. Most function definitions are moved inline and made constexpr. This also adds a UDL to construct a Utf16View from a string literal: `auto string = u"hello"sv;` This lets us remove the NTTP Utf16View constructor, as we have found that such constructors bloat binary size quite a bit.
This commit is contained in:
parent
c17b067e1d
commit
86b1c78c1a
Notes:
github-actions[bot]
2025-07-03 13:53:23 +00:00
Author: https://github.com/trflynn89
Commit: 86b1c78c1a
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5228
Reviewed-by: https://github.com/ADKaster ✅
Reviewed-by: https://github.com/shannonbooth
17 changed files with 406 additions and 421 deletions
|
@ -93,22 +93,7 @@ ErrorOr<String> String::from_utf16_be(ReadonlyBytes bytes)
|
||||||
|
|
||||||
ErrorOr<String> String::from_utf16(Utf16View const& utf16)
|
ErrorOr<String> String::from_utf16(Utf16View const& utf16)
|
||||||
{
|
{
|
||||||
if (!utf16.validate())
|
return utf16.to_utf8();
|
||||||
return Error::from_string_literal("String::from_utf16: Input was not valid UTF-16");
|
|
||||||
if (utf16.is_empty())
|
|
||||||
return String {};
|
|
||||||
|
|
||||||
String result;
|
|
||||||
|
|
||||||
auto utf8_length = simdutf::utf8_length_from_utf16(utf16.char_data(), utf16.length_in_code_units());
|
|
||||||
|
|
||||||
TRY(result.replace_with_new_string(utf8_length, [&](Bytes buffer) -> ErrorOr<void> {
|
|
||||||
[[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(utf16.char_data(), utf16.length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
|
|
||||||
ASSERT(result == buffer.size());
|
|
||||||
return {};
|
|
||||||
}));
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count)
|
ErrorOr<String> String::from_stream(Stream& stream, size_t byte_count)
|
||||||
|
|
|
@ -86,6 +86,12 @@ public:
|
||||||
return replace_with_new_string(byte_count, forward<Func>(callback));
|
return replace_with_new_string(byte_count, forward<Func>(callback));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template<typename Func>
|
||||||
|
ALWAYS_INLINE ErrorOr<void> replace_with_new_string(Badge<Utf16View>, size_t byte_count, Func&& callback)
|
||||||
|
{
|
||||||
|
return replace_with_new_string(byte_count, forward<Func>(callback));
|
||||||
|
}
|
||||||
|
|
||||||
protected:
|
protected:
|
||||||
template<typename Func>
|
template<typename Func>
|
||||||
ErrorOr<void> replace_with_new_string(size_t byte_count, Func&& callback)
|
ErrorOr<void> replace_with_new_string(size_t byte_count, Func&& callback)
|
||||||
|
|
|
@ -250,17 +250,17 @@ ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
|
||||||
if (utf16_view.is_empty())
|
if (utf16_view.is_empty())
|
||||||
return {};
|
return {};
|
||||||
|
|
||||||
auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(utf16_view.span());
|
auto remaining_view = utf16_view.span();
|
||||||
|
auto maximum_utf8_length = UnicodeUtils::maximum_utf8_length_from_utf16(remaining_view);
|
||||||
|
|
||||||
// Possibly over-allocate a little to ensure we don't have to allocate later.
|
// Possibly over-allocate a little to ensure we don't have to allocate later.
|
||||||
TRY(will_append(maximum_utf8_length));
|
TRY(will_append(maximum_utf8_length));
|
||||||
|
|
||||||
Utf16View remaining_view = utf16_view;
|
|
||||||
for (;;) {
|
for (;;) {
|
||||||
auto uninitialized_data_pointer = static_cast<char*>(m_buffer.end_pointer());
|
auto* uninitialized_data_pointer = static_cast<char*>(m_buffer.end_pointer());
|
||||||
|
|
||||||
// Fast path.
|
// Fast path.
|
||||||
auto result = simdutf::convert_utf16_to_utf8_with_errors(remaining_view.char_data(), remaining_view.length_in_code_units(), uninitialized_data_pointer);
|
auto result = simdutf::convert_utf16_to_utf8_with_errors(remaining_view.data(), remaining_view.size(), uninitialized_data_pointer);
|
||||||
if (result.error == simdutf::SUCCESS) {
|
if (result.error == simdutf::SUCCESS) {
|
||||||
auto bytes_just_written = result.count;
|
auto bytes_just_written = result.count;
|
||||||
m_buffer.set_size(m_buffer.size() + bytes_just_written);
|
m_buffer.set_size(m_buffer.size() + bytes_just_written);
|
||||||
|
@ -269,13 +269,13 @@ ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
|
||||||
|
|
||||||
// Slow path. Found unmatched surrogate code unit.
|
// Slow path. Found unmatched surrogate code unit.
|
||||||
auto first_invalid_code_unit = result.count;
|
auto first_invalid_code_unit = result.count;
|
||||||
ASSERT(first_invalid_code_unit < remaining_view.length_in_code_units());
|
ASSERT(first_invalid_code_unit < remaining_view.size());
|
||||||
|
|
||||||
// Unfortunately, `simdutf` does not tell us how many bytes it just wrote in case of an error, so we have to calculate it ourselves.
|
// Unfortunately, `simdutf` does not tell us how many bytes it just wrote in case of an error, so we have to calculate it ourselves.
|
||||||
auto bytes_just_written = simdutf::utf8_length_from_utf16(remaining_view.char_data(), first_invalid_code_unit);
|
auto bytes_just_written = simdutf::utf8_length_from_utf16(remaining_view.data(), first_invalid_code_unit);
|
||||||
|
|
||||||
do {
|
do {
|
||||||
auto code_unit = remaining_view.code_unit_at(first_invalid_code_unit++);
|
auto code_unit = remaining_view[first_invalid_code_unit++];
|
||||||
|
|
||||||
// Invalid surrogate code units are U+D800 - U+DFFF, so they are always encoded using 3 bytes.
|
// Invalid surrogate code units are U+D800 - U+DFFF, so they are always encoded using 3 bytes.
|
||||||
ASSERT(code_unit >= 0xD800 && code_unit <= 0xDFFF);
|
ASSERT(code_unit >= 0xD800 && code_unit <= 0xDFFF);
|
||||||
|
@ -283,11 +283,11 @@ ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
|
||||||
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 12) & 0x0f) | 0xe0);
|
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 12) & 0x0f) | 0xe0);
|
||||||
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 6) & 0x3f) | 0x80);
|
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 6) & 0x3f) | 0x80);
|
||||||
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 0) & 0x3f) | 0x80);
|
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 0) & 0x3f) | 0x80);
|
||||||
} while (first_invalid_code_unit < remaining_view.length_in_code_units() && UnicodeUtils::is_utf16_low_surrogate(remaining_view.data()[first_invalid_code_unit]));
|
} while (first_invalid_code_unit < remaining_view.size() && UnicodeUtils::is_utf16_low_surrogate(remaining_view.data()[first_invalid_code_unit]));
|
||||||
|
|
||||||
// Code unit might no longer be invalid, retry on the remaining data.
|
// Code unit might no longer be invalid, retry on the remaining data.
|
||||||
m_buffer.set_size(m_buffer.size() + bytes_just_written);
|
m_buffer.set_size(m_buffer.size() + bytes_just_written);
|
||||||
remaining_view = remaining_view.substring_view(first_invalid_code_unit);
|
remaining_view = remaining_view.slice(first_invalid_code_unit);
|
||||||
}
|
}
|
||||||
|
|
||||||
return {};
|
return {};
|
||||||
|
|
|
@ -212,7 +212,7 @@ String StringView::to_ascii_lowercase_string() const
|
||||||
|
|
||||||
String result;
|
String result;
|
||||||
|
|
||||||
MUST(result.replace_with_new_string({}, length(), [&](Bytes buffer) -> ErrorOr<void> {
|
MUST(result.replace_with_new_string(Badge<StringView> {}, length(), [&](Bytes buffer) -> ErrorOr<void> {
|
||||||
for (auto [i, character] : enumerate(bytes()))
|
for (auto [i, character] : enumerate(bytes()))
|
||||||
buffer[i] = static_cast<u8>(AK::to_ascii_lowercase(character));
|
buffer[i] = static_cast<u8>(AK::to_ascii_lowercase(character));
|
||||||
return {};
|
return {};
|
||||||
|
@ -227,7 +227,7 @@ String StringView::to_ascii_uppercase_string() const
|
||||||
|
|
||||||
String result;
|
String result;
|
||||||
|
|
||||||
MUST(result.replace_with_new_string({}, length(), [&](Bytes buffer) -> ErrorOr<void> {
|
MUST(result.replace_with_new_string(Badge<StringView> {}, length(), [&](Bytes buffer) -> ErrorOr<void> {
|
||||||
for (auto [i, character] : enumerate(bytes()))
|
for (auto [i, character] : enumerate(bytes()))
|
||||||
buffer[i] = static_cast<u8>(AK::to_ascii_uppercase(character));
|
buffer[i] = static_cast<u8>(AK::to_ascii_uppercase(character));
|
||||||
return {};
|
return {};
|
||||||
|
|
|
@ -158,7 +158,7 @@ constexpr ErrorOr<size_t> try_code_point_to_utf16(u32 code_point, Callback callb
|
||||||
* Compute the maximum number of UTF-8 bytes needed to store a given UTF-16 string, accounting for unmatched UTF-16 surrogates.
|
* Compute the maximum number of UTF-8 bytes needed to store a given UTF-16 string, accounting for unmatched UTF-16 surrogates.
|
||||||
* This function will overcount by at most 33%; 2 bytes for every valid UTF-16 codepoint between U+10000 and U+10FFFF.
|
* This function will overcount by at most 33%; 2 bytes for every valid UTF-16 codepoint between U+10000 and U+10FFFF.
|
||||||
*/
|
*/
|
||||||
[[nodiscard]] static inline size_t maximum_utf8_length_from_utf16(ReadonlySpan<u16> code_units)
|
[[nodiscard]] static inline size_t maximum_utf8_length_from_utf16(ReadonlySpan<char16_t> code_units)
|
||||||
{
|
{
|
||||||
// # UTF-8 code point -> no. UTF-8 bytes needed
|
// # UTF-8 code point -> no. UTF-8 bytes needed
|
||||||
// U+0000 - U+007F => 1 UTF-8 bytes
|
// U+0000 - U+007F => 1 UTF-8 bytes
|
||||||
|
|
293
AK/Utf16View.cpp
293
AK/Utf16View.cpp
|
@ -80,77 +80,75 @@ ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const& utf32_view)
|
||||||
return Utf16ConversionResult { utf16_data, length };
|
return Utf16ConversionResult { utf16_data, length };
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool validate_utf16_le(ReadonlyBytes bytes)
|
||||||
|
{
|
||||||
|
return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool validate_utf16_be(ReadonlyBytes bytes)
|
||||||
|
{
|
||||||
|
return simdutf::validate_utf16be(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
|
||||||
|
}
|
||||||
|
|
||||||
size_t utf16_code_unit_length_from_utf8(StringView string)
|
size_t utf16_code_unit_length_from_utf8(StringView string)
|
||||||
{
|
{
|
||||||
return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length());
|
return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
|
||||||
|
{
|
||||||
|
if (is_empty())
|
||||||
|
return String {};
|
||||||
|
if (!validate(allow_invalid_code_units))
|
||||||
|
return Error::from_string_literal("Input was not valid UTF-16");
|
||||||
|
|
||||||
|
if (allow_invalid_code_units == AllowInvalidCodeUnits::No) {
|
||||||
|
String result;
|
||||||
|
auto utf8_length = simdutf::utf8_length_from_utf16(m_string, length_in_code_units());
|
||||||
|
|
||||||
|
TRY(result.replace_with_new_string(Badge<Utf16View> {}, utf8_length, [&](Bytes buffer) -> ErrorOr<void> {
|
||||||
|
[[maybe_unused]] auto result = simdutf::convert_utf16_to_utf8(m_string, length_in_code_units(), reinterpret_cast<char*>(buffer.data()));
|
||||||
|
ASSERT(result == buffer.size());
|
||||||
|
return {};
|
||||||
|
}));
|
||||||
|
|
||||||
|
return result;
|
||||||
|
}
|
||||||
|
|
||||||
|
StringBuilder builder;
|
||||||
|
builder.append(*this);
|
||||||
|
return builder.to_string();
|
||||||
|
}
|
||||||
|
|
||||||
ErrorOr<ByteString> Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invalid_code_units) const
|
ErrorOr<ByteString> Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invalid_code_units) const
|
||||||
{
|
{
|
||||||
return TRY(to_utf8(allow_invalid_code_units)).to_byte_string();
|
return TRY(to_utf8(allow_invalid_code_units)).to_byte_string();
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
|
|
||||||
{
|
|
||||||
if (allow_invalid_code_units == AllowInvalidCodeUnits::No)
|
|
||||||
return String::from_utf16(*this);
|
|
||||||
|
|
||||||
StringBuilder builder;
|
|
||||||
builder.append(*this);
|
|
||||||
return builder.to_string();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Utf16View::is_ascii() const
|
bool Utf16View::is_ascii() const
|
||||||
{
|
{
|
||||||
return simdutf::validate_ascii(reinterpret_cast<char const*>(m_code_units.data()), length_in_code_units() * sizeof(char16_t));
|
return simdutf::validate_ascii(reinterpret_cast<char const*>(m_string), length_in_code_units() * sizeof(char16_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Utf16View::length_in_code_points() const
|
bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const
|
||||||
{
|
{
|
||||||
if (m_length_in_code_points == NumericLimits<size_t>::max())
|
auto view = *this;
|
||||||
m_length_in_code_points = calculate_length_in_code_points();
|
valid_code_units = 0;
|
||||||
return m_length_in_code_points;
|
|
||||||
}
|
|
||||||
|
|
||||||
u16 Utf16View::code_unit_at(size_t index) const
|
while (!view.is_empty()) {
|
||||||
{
|
auto result = simdutf::validate_utf16_with_errors(view.m_string, view.length_in_code_units());
|
||||||
VERIFY(index < length_in_code_units());
|
valid_code_units += result.count;
|
||||||
return m_code_units[index];
|
|
||||||
}
|
|
||||||
|
|
||||||
u32 Utf16View::code_point_at(size_t index) const
|
if (result.error == simdutf::SUCCESS)
|
||||||
{
|
return true;
|
||||||
VERIFY(index < length_in_code_units());
|
if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE)
|
||||||
|
return false;
|
||||||
|
|
||||||
u32 code_point = code_unit_at(index);
|
view = view.substring_view(result.count + 1);
|
||||||
if (!UnicodeUtils::is_utf16_high_surrogate(code_point) && !UnicodeUtils::is_utf16_low_surrogate(code_point))
|
++valid_code_units;
|
||||||
return code_point;
|
|
||||||
if (UnicodeUtils::is_utf16_low_surrogate(code_point) || (index + 1 == length_in_code_units()))
|
|
||||||
return code_point;
|
|
||||||
|
|
||||||
auto second = code_unit_at(index + 1);
|
|
||||||
if (!UnicodeUtils::is_utf16_low_surrogate(second))
|
|
||||||
return code_point;
|
|
||||||
|
|
||||||
return UnicodeUtils::decode_utf16_surrogate_pair(code_point, second);
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
|
|
||||||
{
|
|
||||||
if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
|
|
||||||
return code_unit_offset;
|
|
||||||
|
|
||||||
size_t code_point_offset = 0;
|
|
||||||
|
|
||||||
for (auto it = begin(); it != end(); ++it) {
|
|
||||||
if (code_unit_offset == 0)
|
|
||||||
return code_point_offset;
|
|
||||||
|
|
||||||
code_unit_offset -= it.length_in_code_units();
|
|
||||||
++code_point_offset;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return code_point_offset;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
|
size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
|
||||||
|
@ -171,19 +169,22 @@ size_t Utf16View::code_unit_offset_of(size_t code_point_offset) const
|
||||||
return code_unit_offset;
|
return code_unit_offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Utf16View::code_unit_offset_of(Utf16CodePointIterator const& it) const
|
size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
|
||||||
{
|
{
|
||||||
VERIFY(it.m_ptr >= begin_ptr());
|
if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
|
||||||
VERIFY(it.m_ptr <= end_ptr());
|
return code_unit_offset;
|
||||||
|
|
||||||
return it.m_ptr - begin_ptr();
|
size_t code_point_offset = 0;
|
||||||
}
|
|
||||||
|
|
||||||
Utf16View Utf16View::substring_view(size_t code_unit_offset, size_t code_unit_length) const
|
for (auto it = begin(); it != end(); ++it) {
|
||||||
{
|
if (code_unit_offset == 0)
|
||||||
VERIFY(!Checked<size_t>::addition_would_overflow(code_unit_offset, code_unit_length));
|
return code_point_offset;
|
||||||
|
|
||||||
return Utf16View { m_code_units.slice(code_unit_offset, code_unit_length) };
|
code_unit_offset -= it.length_in_code_units();
|
||||||
|
++code_point_offset;
|
||||||
|
}
|
||||||
|
|
||||||
|
return code_point_offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const
|
Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t code_point_length) const
|
||||||
|
@ -194,7 +195,10 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
|
||||||
if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
|
if (length_in_code_points() == length_in_code_units()) // Fast path: all code points are one code unit.
|
||||||
return substring_view(code_point_offset, code_point_length);
|
return substring_view(code_point_offset, code_point_length);
|
||||||
|
|
||||||
auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) { return it.m_ptr - begin_ptr(); };
|
auto code_unit_offset_of = [&](Utf16CodePointIterator const& it) {
|
||||||
|
return it.m_iterator - m_string;
|
||||||
|
};
|
||||||
|
|
||||||
size_t code_point_index = 0;
|
size_t code_point_index = 0;
|
||||||
size_t code_unit_offset = 0;
|
size_t code_unit_offset = 0;
|
||||||
|
|
||||||
|
@ -213,101 +217,13 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
|
||||||
VERIFY_NOT_REACHED();
|
VERIFY_NOT_REACHED();
|
||||||
}
|
}
|
||||||
|
|
||||||
Optional<size_t> Utf16View::find_code_unit_offset(Utf16View const& needle, size_t start_offset) const
|
|
||||||
{
|
|
||||||
return m_code_units.index_of(needle.m_code_units, start_offset);
|
|
||||||
}
|
|
||||||
|
|
||||||
Optional<size_t> Utf16View::find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset) const
|
|
||||||
{
|
|
||||||
Checked maximum_offset { start_offset };
|
|
||||||
maximum_offset += needle.length_in_code_units();
|
|
||||||
if (maximum_offset.has_overflow() || maximum_offset.value() > length_in_code_units())
|
|
||||||
return {};
|
|
||||||
|
|
||||||
if (needle.is_empty())
|
|
||||||
return start_offset;
|
|
||||||
|
|
||||||
size_t index = start_offset;
|
|
||||||
while (index <= length_in_code_units() - needle.length_in_code_units()) {
|
|
||||||
Utf16View const slice { m_code_units.slice(index, needle.length_in_code_units()) };
|
|
||||||
if (slice.equals_ignoring_case(needle))
|
|
||||||
return index;
|
|
||||||
index += slice.begin().length_in_code_units();
|
|
||||||
}
|
|
||||||
|
|
||||||
return {};
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Utf16View::starts_with(Utf16View const& needle) const
|
|
||||||
{
|
|
||||||
if (needle.is_empty())
|
|
||||||
return true;
|
|
||||||
if (is_empty())
|
|
||||||
return false;
|
|
||||||
if (needle.length_in_code_units() > length_in_code_units())
|
|
||||||
return false;
|
|
||||||
if (begin_ptr() == needle.begin_ptr())
|
|
||||||
return true;
|
|
||||||
|
|
||||||
for (auto this_it = begin(), needle_it = needle.begin(); needle_it != needle.end(); ++needle_it, ++this_it) {
|
|
||||||
if (*this_it != *needle_it)
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
// https://infra.spec.whatwg.org/#code-unit-less-than
|
|
||||||
bool Utf16View::is_code_unit_less_than(Utf16View const& other) const
|
|
||||||
{
|
|
||||||
auto a = m_code_units;
|
|
||||||
auto b = other.m_code_units;
|
|
||||||
|
|
||||||
auto common_length = min(a.size(), b.size());
|
|
||||||
|
|
||||||
for (size_t position = 0; position < common_length; ++position) {
|
|
||||||
if (a[position] != b[position])
|
|
||||||
return a[position] < b[position];
|
|
||||||
}
|
|
||||||
|
|
||||||
return a.size() < b.size();
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Utf16View::validate(AllowInvalidCodeUnits allow_invalid_code_units) const
|
|
||||||
{
|
|
||||||
size_t valid_code_units = 0;
|
|
||||||
return validate(valid_code_units, allow_invalid_code_units);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const
|
|
||||||
{
|
|
||||||
auto view = *this;
|
|
||||||
valid_code_units = 0;
|
|
||||||
|
|
||||||
while (!view.is_empty()) {
|
|
||||||
auto result = simdutf::validate_utf16_with_errors(view.char_data(), view.length_in_code_units());
|
|
||||||
valid_code_units += result.count;
|
|
||||||
|
|
||||||
if (result.error == simdutf::SUCCESS)
|
|
||||||
return true;
|
|
||||||
if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE)
|
|
||||||
return false;
|
|
||||||
|
|
||||||
view = view.substring_view(result.count + 1);
|
|
||||||
++valid_code_units;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t Utf16View::calculate_length_in_code_points() const
|
size_t Utf16View::calculate_length_in_code_points() const
|
||||||
{
|
{
|
||||||
// FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement
|
// FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement
|
||||||
// for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can
|
// for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can
|
||||||
// remove this branch.
|
// remove this branch.
|
||||||
if (validate()) [[likely]]
|
if (validate()) [[likely]]
|
||||||
return simdutf::count_utf16(char_data(), length_in_code_units());
|
return simdutf::count_utf16(m_string, length_in_code_units());
|
||||||
|
|
||||||
size_t code_points = 0;
|
size_t code_points = 0;
|
||||||
for ([[maybe_unused]] auto code_point : *this)
|
for ([[maybe_unused]] auto code_point : *this)
|
||||||
|
@ -315,81 +231,4 @@ size_t Utf16View::calculate_length_in_code_points() const
|
||||||
return code_points;
|
return code_points;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Utf16View::equals_ignoring_case(Utf16View const& other) const
|
|
||||||
{
|
|
||||||
if (length_in_code_units() != other.length_in_code_units())
|
|
||||||
return false;
|
|
||||||
|
|
||||||
for (size_t i = 0; i < length_in_code_units(); ++i) {
|
|
||||||
// FIXME: Handle non-ASCII case insensitive comparisons.
|
|
||||||
if (to_ascii_lowercase(m_code_units[i]) != to_ascii_lowercase(other.m_code_units[i]))
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
Utf16CodePointIterator& Utf16CodePointIterator::operator++()
|
|
||||||
{
|
|
||||||
size_t code_units = length_in_code_units();
|
|
||||||
|
|
||||||
if (code_units > m_remaining_code_units) {
|
|
||||||
// If there aren't enough code units remaining, skip to the end.
|
|
||||||
m_ptr += m_remaining_code_units;
|
|
||||||
m_remaining_code_units = 0;
|
|
||||||
} else {
|
|
||||||
m_ptr += code_units;
|
|
||||||
m_remaining_code_units -= code_units;
|
|
||||||
}
|
|
||||||
|
|
||||||
return *this;
|
|
||||||
}
|
|
||||||
|
|
||||||
u32 Utf16CodePointIterator::operator*() const
|
|
||||||
{
|
|
||||||
VERIFY(m_remaining_code_units > 0);
|
|
||||||
|
|
||||||
// rfc2781, 2.2 Decoding UTF-16
|
|
||||||
// 1) If W1 < 0xD800 or W1 > 0xDFFF, the character value U is the value
|
|
||||||
// of W1. Terminate.
|
|
||||||
// 2) Determine if W1 is between 0xD800 and 0xDBFF. If not, the sequence
|
|
||||||
// is in error and no valid character can be obtained using W1.
|
|
||||||
// Terminate.
|
|
||||||
// 3) If there is no W2 (that is, the sequence ends with W1), or if W2
|
|
||||||
// is not between 0xDC00 and 0xDFFF, the sequence is in error.
|
|
||||||
// Terminate.
|
|
||||||
// 4) Construct a 20-bit unsigned integer U', taking the 10 low-order
|
|
||||||
// bits of W1 as its 10 high-order bits and the 10 low-order bits of
|
|
||||||
// W2 as its 10 low-order bits.
|
|
||||||
// 5) Add 0x10000 to U' to obtain the character value U. Terminate.
|
|
||||||
|
|
||||||
auto code_unit = *m_ptr;
|
|
||||||
|
|
||||||
if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) {
|
|
||||||
if (m_remaining_code_units > 1) {
|
|
||||||
auto next_code_unit = *(m_ptr + 1);
|
|
||||||
|
|
||||||
if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit))
|
|
||||||
return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
|
|
||||||
}
|
|
||||||
|
|
||||||
return UnicodeUtils::REPLACEMENT_CODE_POINT;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (UnicodeUtils::is_utf16_low_surrogate(code_unit))
|
|
||||||
return UnicodeUtils::REPLACEMENT_CODE_POINT;
|
|
||||||
|
|
||||||
return static_cast<u32>(code_unit);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool validate_utf16_le(ReadonlyBytes bytes)
|
|
||||||
{
|
|
||||||
return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool validate_utf16_be(ReadonlyBytes bytes)
|
|
||||||
{
|
|
||||||
return simdutf::validate_utf16be(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
294
AK/Utf16View.h
294
AK/Utf16View.h
|
@ -10,6 +10,7 @@
|
||||||
#include <AK/Error.h>
|
#include <AK/Error.h>
|
||||||
#include <AK/Format.h>
|
#include <AK/Format.h>
|
||||||
#include <AK/Forward.h>
|
#include <AK/Forward.h>
|
||||||
|
#include <AK/MemMem.h>
|
||||||
#include <AK/Optional.h>
|
#include <AK/Optional.h>
|
||||||
#include <AK/Span.h>
|
#include <AK/Span.h>
|
||||||
#include <AK/String.h>
|
#include <AK/String.h>
|
||||||
|
@ -21,7 +22,7 @@
|
||||||
|
|
||||||
namespace AK {
|
namespace AK {
|
||||||
|
|
||||||
using Utf16Data = Vector<u16, 1>;
|
using Utf16Data = Vector<char16_t, 1>;
|
||||||
|
|
||||||
struct Utf16ConversionResult {
|
struct Utf16ConversionResult {
|
||||||
Utf16Data data;
|
Utf16Data data;
|
||||||
|
@ -36,8 +37,6 @@ ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const&);
|
||||||
|
|
||||||
size_t utf16_code_unit_length_from_utf8(StringView);
|
size_t utf16_code_unit_length_from_utf8(StringView);
|
||||||
|
|
||||||
class Utf16View;
|
|
||||||
|
|
||||||
class Utf16CodePointIterator {
|
class Utf16CodePointIterator {
|
||||||
friend class Utf16View;
|
friend class Utf16View;
|
||||||
|
|
||||||
|
@ -45,27 +44,57 @@ public:
|
||||||
Utf16CodePointIterator() = default;
|
Utf16CodePointIterator() = default;
|
||||||
~Utf16CodePointIterator() = default;
|
~Utf16CodePointIterator() = default;
|
||||||
|
|
||||||
bool operator==(Utf16CodePointIterator const& other) const
|
constexpr Utf16CodePointIterator& operator++()
|
||||||
{
|
{
|
||||||
return (m_ptr == other.m_ptr) && (m_remaining_code_units == other.m_remaining_code_units);
|
VERIFY(m_remaining_code_units > 0);
|
||||||
|
|
||||||
|
auto length = min(length_in_code_units(), m_remaining_code_units);
|
||||||
|
m_iterator += length;
|
||||||
|
m_remaining_code_units -= length;
|
||||||
|
|
||||||
|
return *this;
|
||||||
}
|
}
|
||||||
|
|
||||||
Utf16CodePointIterator& operator++();
|
constexpr u32 operator*() const
|
||||||
u32 operator*() const;
|
{
|
||||||
|
VERIFY(m_remaining_code_units > 0);
|
||||||
|
auto code_unit = *m_iterator;
|
||||||
|
|
||||||
size_t length_in_code_units() const
|
if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) {
|
||||||
|
if (m_remaining_code_units > 1) {
|
||||||
|
auto next_code_unit = *(m_iterator + 1);
|
||||||
|
|
||||||
|
if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit))
|
||||||
|
return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
|
||||||
|
}
|
||||||
|
|
||||||
|
return UnicodeUtils::REPLACEMENT_CODE_POINT;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (UnicodeUtils::is_utf16_low_surrogate(code_unit))
|
||||||
|
return UnicodeUtils::REPLACEMENT_CODE_POINT;
|
||||||
|
|
||||||
|
return static_cast<u32>(code_unit);
|
||||||
|
}
|
||||||
|
|
||||||
|
[[nodiscard]] constexpr bool operator==(Utf16CodePointIterator const& other) const
|
||||||
|
{
|
||||||
|
return (m_iterator == other.m_iterator) && (m_remaining_code_units == other.m_remaining_code_units);
|
||||||
|
}
|
||||||
|
|
||||||
|
[[nodiscard]] constexpr size_t length_in_code_units() const
|
||||||
{
|
{
|
||||||
return UnicodeUtils::code_unit_length_for_code_point(**this);
|
return UnicodeUtils::code_unit_length_for_code_point(**this);
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Utf16CodePointIterator(u16 const* ptr, size_t length)
|
Utf16CodePointIterator(char16_t const* ptr, size_t length)
|
||||||
: m_ptr(ptr)
|
: m_iterator(ptr)
|
||||||
, m_remaining_code_units(length)
|
, m_remaining_code_units(length)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
u16 const* m_ptr { nullptr };
|
char16_t const* m_iterator { nullptr };
|
||||||
size_t m_remaining_code_units { 0 };
|
size_t m_remaining_code_units { 0 };
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -73,101 +102,233 @@ class Utf16View {
|
||||||
public:
|
public:
|
||||||
using Iterator = Utf16CodePointIterator;
|
using Iterator = Utf16CodePointIterator;
|
||||||
|
|
||||||
|
enum class AllowInvalidCodeUnits {
|
||||||
|
No,
|
||||||
|
Yes,
|
||||||
|
};
|
||||||
|
|
||||||
Utf16View() = default;
|
Utf16View() = default;
|
||||||
~Utf16View() = default;
|
~Utf16View() = default;
|
||||||
|
|
||||||
explicit Utf16View(ReadonlySpan<u16> code_units)
|
constexpr Utf16View(char16_t const* string, size_t length_in_code_units)
|
||||||
: m_code_units(code_units)
|
: m_string(string)
|
||||||
|
, m_length_in_code_units(length_in_code_units)
|
||||||
|
{
|
||||||
|
}
|
||||||
|
|
||||||
|
constexpr Utf16View(Utf16Data const& string)
|
||||||
|
: m_string(string.data())
|
||||||
|
, m_length_in_code_units(string.size())
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
Utf16View(Utf16ConversionResult&&) = delete;
|
Utf16View(Utf16ConversionResult&&) = delete;
|
||||||
explicit Utf16View(Utf16ConversionResult const& conversion_result)
|
explicit Utf16View(Utf16ConversionResult const& conversion_result)
|
||||||
: m_code_units(conversion_result.data)
|
: m_string(conversion_result.data.data())
|
||||||
|
, m_length_in_code_units(conversion_result.data.size())
|
||||||
, m_length_in_code_points(conversion_result.code_point_count)
|
, m_length_in_code_points(conversion_result.code_point_count)
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
template<size_t Size>
|
ErrorOr<String> to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
|
||||||
Utf16View(char16_t const (&code_units)[Size])
|
ErrorOr<ByteString> to_byte_string(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
|
||||||
: m_code_units(
|
|
||||||
reinterpret_cast<u16 const*>(&code_units[0]),
|
[[nodiscard]] constexpr ReadonlySpan<char16_t> span() const
|
||||||
code_units[Size - 1] == u'\0' ? Size - 1 : Size)
|
|
||||||
{
|
{
|
||||||
|
return { m_string, length_in_code_units() };
|
||||||
}
|
}
|
||||||
|
|
||||||
bool operator==(Utf16View const& other) const { return m_code_units == other.m_code_units; }
|
[[nodiscard]] constexpr bool operator==(Utf16View const& other) const
|
||||||
|
{
|
||||||
|
if (length_in_code_units() != other.length_in_code_units())
|
||||||
|
return false;
|
||||||
|
return TypedTransfer<char16_t>::compare(m_string, other.m_string, length_in_code_units());
|
||||||
|
}
|
||||||
|
|
||||||
enum class AllowInvalidCodeUnits {
|
[[nodiscard]] constexpr bool equals_ignoring_case(Utf16View const& other) const
|
||||||
Yes,
|
{
|
||||||
No,
|
// FIXME: Handle non-ASCII case insensitive comparisons.
|
||||||
};
|
return equals_ignoring_ascii_case(other);
|
||||||
|
}
|
||||||
|
|
||||||
ErrorOr<ByteString> to_byte_string(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
|
[[nodiscard]] constexpr bool equals_ignoring_ascii_case(Utf16View const& other) const
|
||||||
ErrorOr<String> to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
|
{
|
||||||
|
if (length_in_code_units() != other.length_in_code_units())
|
||||||
|
return false;
|
||||||
|
|
||||||
void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; }
|
for (size_t i = 0; i < length_in_code_units(); ++i) {
|
||||||
|
if (to_ascii_lowercase(code_unit_at(i)) != to_ascii_lowercase(other.code_unit_at(i)))
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
bool is_null() const { return m_code_units.is_null(); }
|
return true;
|
||||||
bool is_empty() const { return m_code_units.is_empty(); }
|
}
|
||||||
bool is_ascii() const;
|
|
||||||
|
|
||||||
size_t length_in_code_units() const { return m_code_units.size(); }
|
template<typename... Ts>
|
||||||
size_t length_in_code_points() const;
|
[[nodiscard]] constexpr bool is_one_of(Ts&&... strings) const
|
||||||
|
{
|
||||||
|
return (this->operator==(forward<Ts>(strings)) || ...);
|
||||||
|
}
|
||||||
|
|
||||||
Optional<size_t> length_in_code_points_if_known() const
|
template<typename... Ts>
|
||||||
|
[[nodiscard]] constexpr bool is_one_of_ignoring_ascii_case(Ts&&... strings) const
|
||||||
|
{
|
||||||
|
return (this->equals_ignoring_ascii_case(forward<Ts>(strings)) || ...);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Hashes the raw bytes of the viewed code units. An empty view always hashes to 0.
[[nodiscard]] constexpr u32 hash() const
{
    auto const code_unit_count = length_in_code_units();
    if (code_unit_count == 0)
        return 0;

    // string_hash() works on bytes, so pass the byte size of the code unit buffer.
    auto const* bytes = reinterpret_cast<char const*>(m_string);
    return string_hash(bytes, code_unit_count * sizeof(char16_t));
}
|
||||||
|
|
||||||
|
[[nodiscard]] constexpr bool is_null() const { return m_string == nullptr; }
|
||||||
|
[[nodiscard]] constexpr bool is_empty() const { return length_in_code_units() == 0; }
|
||||||
|
[[nodiscard]] bool is_ascii() const;
|
||||||
|
|
||||||
|
// Convenience overload of validate() for callers that only need the yes/no answer and
// do not care how many leading code units were valid.
[[nodiscard]] ALWAYS_INLINE bool validate(AllowInvalidCodeUnits allow_invalid_code_units = AllowInvalidCodeUnits::No) const
{
    size_t ignored_valid_code_units = 0;
    return validate(ignored_valid_code_units, allow_invalid_code_units);
}
|
||||||
|
|
||||||
|
[[nodiscard]] bool validate(size_t& valid_code_units, AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
|
||||||
|
|
||||||
|
// Returns the number of UTF-16 code units in this view (the stored m_length_in_code_units).
[[nodiscard]] constexpr size_t length_in_code_units() const { return m_length_in_code_units; }
|
||||||
|
|
||||||
|
// Returns the number of code points in this view. The value is computed on first use and
// cached in m_length_in_code_points; NumericLimits<size_t>::max() marks "not computed yet".
[[nodiscard]] ALWAYS_INLINE size_t length_in_code_points() const
{
    constexpr auto not_yet_computed = NumericLimits<size_t>::max();

    if (m_length_in_code_points != not_yet_computed)
        return m_length_in_code_points;

    m_length_in_code_points = calculate_length_in_code_points();
    return m_length_in_code_points;
}
|
||||||
|
|
||||||
|
constexpr Optional<size_t> length_in_code_points_if_known() const
|
||||||
{
|
{
|
||||||
if (m_length_in_code_points == NumericLimits<size_t>::max())
|
if (m_length_in_code_points == NumericLimits<size_t>::max())
|
||||||
return {};
|
return {};
|
||||||
return m_length_in_code_points;
|
return m_length_in_code_points;
|
||||||
}
|
}
|
||||||
|
|
||||||
u32 hash() const
|
constexpr void unsafe_set_code_point_length(size_t length) const { m_length_in_code_points = length; }
|
||||||
|
|
||||||
|
[[nodiscard]] constexpr char16_t code_unit_at(size_t index) const
|
||||||
{
|
{
|
||||||
if (is_empty())
|
VERIFY(index < length_in_code_units());
|
||||||
return 0;
|
return m_string[index];
|
||||||
return string_hash(reinterpret_cast<char const*>(m_code_units.data()), m_code_units.size() * sizeof(u16));
|
|
||||||
}
|
}
|
||||||
|
|
||||||
Utf16CodePointIterator begin() const { return { begin_ptr(), m_code_units.size() }; }
|
[[nodiscard]] constexpr u32 code_point_at(size_t index) const
|
||||||
Utf16CodePointIterator end() const { return { end_ptr(), 0 }; }
|
{
|
||||||
|
VERIFY(index < length_in_code_units());
|
||||||
|
u32 code_point = code_unit_at(index);
|
||||||
|
|
||||||
u16 const* data() const { return m_code_units.data(); }
|
if (!UnicodeUtils::is_utf16_high_surrogate(code_point) && !UnicodeUtils::is_utf16_low_surrogate(code_point))
|
||||||
char16_t const* char_data() const { return reinterpret_cast<char16_t const*>(data()); }
|
return code_point;
|
||||||
|
if (UnicodeUtils::is_utf16_low_surrogate(code_point) || (index + 1 == length_in_code_units()))
|
||||||
|
return code_point;
|
||||||
|
|
||||||
ReadonlySpan<u16> span() const { return m_code_units; }
|
auto second = code_unit_at(index + 1);
|
||||||
|
if (!UnicodeUtils::is_utf16_low_surrogate(second))
|
||||||
|
return code_point;
|
||||||
|
|
||||||
u16 code_unit_at(size_t index) const;
|
return UnicodeUtils::decode_utf16_surrogate_pair(code_point, second);
|
||||||
u32 code_point_at(size_t index) const;
|
}
|
||||||
|
|
||||||
size_t code_point_offset_of(size_t code_unit_offset) const;
|
[[nodiscard]] size_t code_unit_offset_of(size_t code_point_offset) const;
|
||||||
size_t code_unit_offset_of(size_t code_point_offset) const;
|
[[nodiscard]] size_t code_point_offset_of(size_t code_unit_offset) const;
|
||||||
size_t code_unit_offset_of(Utf16CodePointIterator const&) const;
|
|
||||||
|
|
||||||
Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const;
|
[[nodiscard]] constexpr Utf16CodePointIterator begin() const
|
||||||
Utf16View substring_view(size_t code_unit_offset) const { return substring_view(code_unit_offset, length_in_code_units() - code_unit_offset); }
|
{
|
||||||
|
return { m_string, length_in_code_units() };
|
||||||
|
}
|
||||||
|
|
||||||
Utf16View unicode_substring_view(size_t code_point_offset, size_t code_point_length) const;
|
[[nodiscard]] constexpr Utf16CodePointIterator end() const
|
||||||
Utf16View unicode_substring_view(size_t code_point_offset) const { return unicode_substring_view(code_point_offset, length_in_code_points() - code_point_offset); }
|
{
|
||||||
|
return { m_string + length_in_code_units(), 0 };
|
||||||
|
}
|
||||||
|
|
||||||
Optional<size_t> find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const;
|
[[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset, size_t code_unit_length) const
|
||||||
Optional<size_t> find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset = 0) const;
|
{
|
||||||
|
VERIFY(code_unit_offset + code_unit_length <= length_in_code_units());
|
||||||
|
return { m_string + code_unit_offset, code_unit_length };
|
||||||
|
}
|
||||||
|
|
||||||
bool starts_with(Utf16View const&) const;
|
[[nodiscard]] constexpr Utf16View substring_view(size_t code_unit_offset) const { return substring_view(code_unit_offset, length_in_code_units() - code_unit_offset); }
|
||||||
bool is_code_unit_less_than(Utf16View const& other) const;
|
|
||||||
|
|
||||||
bool validate(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
|
[[nodiscard]] Utf16View unicode_substring_view(size_t code_point_offset, size_t code_point_length) const;
|
||||||
bool validate(size_t& valid_code_units, AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
|
[[nodiscard]] Utf16View unicode_substring_view(size_t code_point_offset) const { return unicode_substring_view(code_point_offset, length_in_code_points() - code_point_offset); }
|
||||||
|
|
||||||
bool equals_ignoring_case(Utf16View const&) const;
|
// Searches for the first code unit equal to `needle`, starting at code unit `start_offset`.
//
// Returns an empty Optional if `start_offset` is out of range or no code unit matches.
//
// The comparison is done on whole code units. A byte-wise search over the buffer could
// match the needle's two bytes straddling two adjacent code units and report a match
// that does not exist.
//
// NOTE(review): The returned value keeps the existing convention of this overload: it is
// the distance in *bytes* from the position `start_offset`, not a code unit index (the
// regex caller divides the result by sizeof(u16)). Confirm whether callers should instead
// receive an absolute code unit offset, and update them together with this function if so.
constexpr Optional<size_t> find_code_unit_offset(char16_t needle, size_t start_offset = 0) const
{
    if (start_offset >= length_in_code_units())
        return {};

    for (size_t index = start_offset; index < length_in_code_units(); ++index) {
        if (m_string[index] == needle)
            return (index - start_offset) * sizeof(char16_t);
    }

    return {};
}
|
||||||
|
|
||||||
|
// Searches for `needle` as a sub-sequence of code units, starting at `start_offset`.
// The search itself is delegated to the span's index_of().
constexpr Optional<size_t> find_code_unit_offset(Utf16View const& needle, size_t start_offset = 0) const
{
    auto const haystack_code_units = span();
    auto const needle_code_units = needle.span();
    return haystack_code_units.index_of(needle_code_units, start_offset);
}
|
||||||
|
|
||||||
|
// Case-insensitive variant of find_code_unit_offset(): returns the code unit offset of the
// first position at or after `start_offset` where `needle` matches ignoring case.
//
// Returns an empty Optional if the needle cannot fit in the remaining code units or no
// position matches. An empty needle matches immediately at `start_offset`.
constexpr Optional<size_t> find_code_unit_offset_ignoring_case(Utf16View const& needle, size_t start_offset = 0) const
{
    auto const needle_length = needle.length_in_code_units();

    // Reject searches whose first candidate window would already run past the end
    // (or whose end position overflows size_t).
    Checked<size_t> first_window_end { start_offset };
    first_window_end += needle_length;
    if (first_window_end.has_overflow() || first_window_end.value() > length_in_code_units())
        return {};

    if (needle_length == 0)
        return start_offset;

    auto const last_candidate = length_in_code_units() - needle_length;

    for (size_t candidate = start_offset; candidate <= last_candidate;) {
        auto const window = substring_view(candidate, needle_length);
        if (window.equals_ignoring_case(needle))
            return candidate;

        // Advance by the code unit length of the code point at the start of the window.
        candidate += window.begin().length_in_code_units();
    }

    return {};
}
|
||||||
|
|
||||||
|
// Returns true if the code units of `needle` form a prefix of this view.
// An empty needle is a prefix of every view, including an empty one.
[[nodiscard]] constexpr bool starts_with(Utf16View const& needle) const
{
    auto const needle_length = needle.length_in_code_units();
    if (needle_length == 0)
        return true;

    // A non-empty needle can never be a prefix of a shorter view (this also covers an
    // empty view).
    if (needle_length > length_in_code_units())
        return false;

    // Same starting pointer and a needle that fits: the prefix trivially matches.
    return m_string == needle.m_string || span().starts_with(needle.span());
}
|
||||||
|
|
||||||
|
// https://infra.spec.whatwg.org/#code-unit-less-than
|
||||||
|
[[nodiscard]] constexpr bool is_code_unit_less_than(Utf16View const& other) const
|
||||||
|
{
|
||||||
|
auto common_length = min(length_in_code_units(), other.length_in_code_units());
|
||||||
|
|
||||||
|
for (size_t position = 0; position < common_length; ++position) {
|
||||||
|
auto this_code_unit = code_unit_at(position);
|
||||||
|
auto other_code_unit = other.code_unit_at(position);
|
||||||
|
|
||||||
|
if (this_code_unit != other_code_unit)
|
||||||
|
return this_code_unit < other_code_unit;
|
||||||
|
}
|
||||||
|
|
||||||
|
return length_in_code_units() < other.length_in_code_units();
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
u16 const* begin_ptr() const { return m_code_units.data(); }
|
[[nodiscard]] size_t calculate_length_in_code_points() const;
|
||||||
u16 const* end_ptr() const { return begin_ptr() + m_code_units.size(); }
|
|
||||||
|
|
||||||
size_t calculate_length_in_code_points() const;
|
char16_t const* m_string { nullptr };
|
||||||
|
size_t m_length_in_code_units { 0 };
|
||||||
ReadonlySpan<u16> m_code_units;
|
|
||||||
mutable size_t m_length_in_code_points { NumericLimits<size_t>::max() };
|
mutable size_t m_length_in_code_points { NumericLimits<size_t>::max() };
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -188,6 +349,13 @@ struct Traits<Utf16View> : public DefaultTraits<Utf16View> {
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// User-defined literal that builds a Utf16View over a char16_t string literal,
// e.g. `u"hello"sv`. The view's validity is checked with ASSERT.
[[nodiscard]] ALWAYS_INLINE AK_STRING_VIEW_LITERAL_CONSTEVAL AK::Utf16View operator""sv(char16_t const* string, size_t length)
{
    auto view = AK::Utf16View { string, length };
    ASSERT(view.validate());
    return view;
}
|
||||||
|
|
||||||
#if USING_AK_GLOBALLY
|
#if USING_AK_GLOBALLY
|
||||||
using AK::Utf16Data;
|
using AK::Utf16Data;
|
||||||
using AK::Utf16View;
|
using AK::Utf16View;
|
||||||
|
|
|
@ -111,7 +111,7 @@ ErrorOr<String> Process::get_name()
|
||||||
if (!length)
|
if (!length)
|
||||||
return Error::from_windows_error();
|
return Error::from_windows_error();
|
||||||
|
|
||||||
return String::from_utf16(Utf16View { { (u16*)path, length } });
|
return String::from_utf16(Utf16View { reinterpret_cast<char16_t const*>(path), length });
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<void> Process::set_name(StringView, SetThreadName)
|
ErrorOr<void> Process::set_name(StringView, SetThreadName)
|
||||||
|
|
|
@ -1271,33 +1271,33 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
|
||||||
Optional<Utf16String> capture_string;
|
Optional<Utf16String> capture_string;
|
||||||
|
|
||||||
// b. If templateRemainder starts with "$$", then
|
// b. If templateRemainder starts with "$$", then
|
||||||
if (template_remainder.starts_with(u"$$")) {
|
if (template_remainder.starts_with(u"$$"sv)) {
|
||||||
// i. Let ref be "$$".
|
// i. Let ref be "$$".
|
||||||
ref = u"$$";
|
ref = u"$$"sv;
|
||||||
|
|
||||||
// ii. Let refReplacement be "$".
|
// ii. Let refReplacement be "$".
|
||||||
ref_replacement = u"$";
|
ref_replacement = u"$"sv;
|
||||||
}
|
}
|
||||||
// c. Else if templateRemainder starts with "$`", then
|
// c. Else if templateRemainder starts with "$`", then
|
||||||
else if (template_remainder.starts_with(u"$`")) {
|
else if (template_remainder.starts_with(u"$`"sv)) {
|
||||||
// i. Let ref be "$`".
|
// i. Let ref be "$`".
|
||||||
ref = u"$`";
|
ref = u"$`"sv;
|
||||||
|
|
||||||
// ii. Let refReplacement be the substring of str from 0 to position.
|
// ii. Let refReplacement be the substring of str from 0 to position.
|
||||||
ref_replacement = str.substring_view(0, position);
|
ref_replacement = str.substring_view(0, position);
|
||||||
}
|
}
|
||||||
// d. Else if templateRemainder starts with "$&", then
|
// d. Else if templateRemainder starts with "$&", then
|
||||||
else if (template_remainder.starts_with(u"$&")) {
|
else if (template_remainder.starts_with(u"$&"sv)) {
|
||||||
// i. Let ref be "$&".
|
// i. Let ref be "$&".
|
||||||
ref = u"$&";
|
ref = u"$&"sv;
|
||||||
|
|
||||||
// ii. Let refReplacement be matched.
|
// ii. Let refReplacement be matched.
|
||||||
ref_replacement = matched;
|
ref_replacement = matched;
|
||||||
}
|
}
|
||||||
// e. Else if templateRemainder starts with "$'" (0x0024 (DOLLAR SIGN) followed by 0x0027 (APOSTROPHE)), then
|
// e. Else if templateRemainder starts with "$'" (0x0024 (DOLLAR SIGN) followed by 0x0027 (APOSTROPHE)), then
|
||||||
else if (template_remainder.starts_with(u"$'")) {
|
else if (template_remainder.starts_with(u"$'"sv)) {
|
||||||
// i. Let ref be "$'".
|
// i. Let ref be "$'".
|
||||||
ref = u"$'";
|
ref = u"$'"sv;
|
||||||
|
|
||||||
// ii. Let matchLength be the length of matched.
|
// ii. Let matchLength be the length of matched.
|
||||||
auto match_length = matched.length_in_code_units();
|
auto match_length = matched.length_in_code_units();
|
||||||
|
@ -1311,7 +1311,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
|
||||||
// v. NOTE: tailPos can exceed stringLength only if this abstract operation was invoked by a call to the intrinsic @@replace method of %RegExp.prototype% on an object whose "exec" property is not the intrinsic %RegExp.prototype.exec%.
|
// v. NOTE: tailPos can exceed stringLength only if this abstract operation was invoked by a call to the intrinsic @@replace method of %RegExp.prototype% on an object whose "exec" property is not the intrinsic %RegExp.prototype.exec%.
|
||||||
}
|
}
|
||||||
// f. Else if templateRemainder starts with "$" followed by 1 or more decimal digits, then
|
// f. Else if templateRemainder starts with "$" followed by 1 or more decimal digits, then
|
||||||
else if (template_remainder.starts_with(u"$") && template_remainder.length_in_code_units() > 1 && is_ascii_digit(template_remainder.code_unit_at(1))) {
|
else if (template_remainder.starts_with(u"$"sv) && template_remainder.length_in_code_units() > 1 && is_ascii_digit(template_remainder.code_unit_at(1))) {
|
||||||
// i. If templateRemainder starts with "$" followed by 2 or more decimal digits, let digitCount be 2. Otherwise, let digitCount be 1.
|
// i. If templateRemainder starts with "$" followed by 2 or more decimal digits, let digitCount be 2. Otherwise, let digitCount be 1.
|
||||||
size_t digit_count = 1;
|
size_t digit_count = 1;
|
||||||
|
|
||||||
|
@ -1373,15 +1373,15 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// g. Else if templateRemainder starts with "$<", then
|
// g. Else if templateRemainder starts with "$<", then
|
||||||
else if (template_remainder.starts_with(u"$<")) {
|
else if (template_remainder.starts_with(u"$<"sv)) {
|
||||||
// i. Let gtPos be StringIndexOf(templateRemainder, ">", 0).
|
// i. Let gtPos be StringIndexOf(templateRemainder, ">", 0).
|
||||||
// NOTE: We can actually start at index 2 because we know the string starts with "$<".
|
// NOTE: We can actually start at index 2 because we know the string starts with "$<".
|
||||||
auto greater_than_position = string_index_of(template_remainder, u">", 2);
|
auto greater_than_position = string_index_of(template_remainder, u">"sv, 2);
|
||||||
|
|
||||||
// ii. If gtPos = -1 or namedCaptures is undefined, then
|
// ii. If gtPos = -1 or namedCaptures is undefined, then
|
||||||
if (!greater_than_position.has_value() || named_captures.is_undefined()) {
|
if (!greater_than_position.has_value() || named_captures.is_undefined()) {
|
||||||
// 1. Let ref be "$<".
|
// 1. Let ref be "$<".
|
||||||
ref = u"$<";
|
ref = u"$<"sv;
|
||||||
|
|
||||||
// 2. Let refReplacement be ref.
|
// 2. Let refReplacement be ref.
|
||||||
ref_replacement = ref;
|
ref_replacement = ref;
|
||||||
|
@ -1427,7 +1427,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
|
||||||
auto ref_length = ref.length_in_code_units();
|
auto ref_length = ref.length_in_code_units();
|
||||||
|
|
||||||
// k. Set result to the string-concatenation of result and refReplacement.
|
// k. Set result to the string-concatenation of result and refReplacement.
|
||||||
result.append(ref_replacement.data(), ref_replacement.length_in_code_points());
|
result.append(ref_replacement.span().data(), ref_replacement.length_in_code_units());
|
||||||
|
|
||||||
// j. Set templateRemainder to the substring of templateRemainder from refLength.
|
// j. Set templateRemainder to the substring of templateRemainder from refLength.
|
||||||
// NOTE: We do this step last because refReplacement may point to templateRemainder.
|
// NOTE: We do this step last because refReplacement may point to templateRemainder.
|
||||||
|
|
|
@ -44,7 +44,7 @@ NonnullRefPtr<Utf16StringImpl> Utf16StringImpl::create(Utf16View const& view)
|
||||||
{
|
{
|
||||||
Utf16Data string;
|
Utf16Data string;
|
||||||
string.ensure_capacity(view.length_in_code_units());
|
string.ensure_capacity(view.length_in_code_units());
|
||||||
string.unchecked_append(view.data(), view.length_in_code_units());
|
string.unchecked_append(view.span().data(), view.length_in_code_units());
|
||||||
|
|
||||||
auto impl = create(move(string));
|
auto impl = create(move(string));
|
||||||
if (auto length_in_code_points = view.length_in_code_points_if_known(); length_in_code_points.has_value())
|
if (auto length_in_code_points = view.length_in_code_points_if_known(); length_in_code_points.has_value())
|
||||||
|
|
|
@ -48,7 +48,7 @@ private:
|
||||||
mutable bool m_has_hash { false };
|
mutable bool m_has_hash { false };
|
||||||
mutable u32 m_hash { 0 };
|
mutable u32 m_hash { 0 };
|
||||||
Utf16Data m_string;
|
Utf16Data m_string;
|
||||||
Utf16View m_cached_view { m_string.span() };
|
Utf16View m_cached_view { m_string };
|
||||||
};
|
};
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -147,9 +147,8 @@ public:
|
||||||
return Vector<RegexStringView> { view };
|
return Vector<RegexStringView> { view };
|
||||||
|
|
||||||
Vector<RegexStringView> views;
|
Vector<RegexStringView> views;
|
||||||
u16 newline = '\n';
|
|
||||||
while (!view.is_empty()) {
|
while (!view.is_empty()) {
|
||||||
auto position = AK::memmem_optional(view.data(), view.length_in_code_units() * sizeof(u16), &newline, sizeof(u16));
|
auto position = view.find_code_unit_offset(u'\n');
|
||||||
if (!position.has_value())
|
if (!position.has_value())
|
||||||
break;
|
break;
|
||||||
auto offset = position.value() / sizeof(u16);
|
auto offset = position.value() / sizeof(u16);
|
||||||
|
|
|
@ -159,8 +159,7 @@ String icu_string_to_string(icu::UnicodeString const& string)
|
||||||
|
|
||||||
String icu_string_to_string(UChar const* string, i32 length)
|
String icu_string_to_string(UChar const* string, i32 length)
|
||||||
{
|
{
|
||||||
ReadonlySpan<u16> view { reinterpret_cast<u16 const*>(string), static_cast<size_t>(length) };
|
return MUST(Utf16View { string, static_cast<size_t>(length) }.to_utf8());
|
||||||
return MUST(Utf16View { view }.to_utf8());
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -75,7 +75,7 @@ public:
|
||||||
|
|
||||||
virtual void set_segmented_text(Utf16View const& text) override
|
virtual void set_segmented_text(Utf16View const& text) override
|
||||||
{
|
{
|
||||||
m_segmented_text = icu::UnicodeString { text.data(), static_cast<i32>(text.length_in_code_units()) };
|
m_segmented_text = icu::UnicodeString { text.span().data(), static_cast<i32>(text.length_in_code_units()) };
|
||||||
m_segmenter->setText(m_segmented_text.get<icu::UnicodeString>());
|
m_segmenter->setText(m_segmented_text.get<icu::UnicodeString>());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -86,11 +86,12 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
|
||||||
auto before_data = utf16_view.substring_view(0, offset);
|
auto before_data = utf16_view.substring_view(0, offset);
|
||||||
auto inserted_data_result = MUST(AK::utf8_to_utf16(data));
|
auto inserted_data_result = MUST(AK::utf8_to_utf16(data));
|
||||||
auto after_data = utf16_view.substring_view(offset + count);
|
auto after_data = utf16_view.substring_view(offset + count);
|
||||||
|
|
||||||
Utf16Data full_data;
|
Utf16Data full_data;
|
||||||
full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
|
full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data_result.data.size() + after_data.length_in_code_units());
|
||||||
full_data.append(before_data.data(), before_data.length_in_code_units());
|
full_data.append(before_data.span().data(), before_data.length_in_code_units());
|
||||||
full_data.extend(inserted_data_result.data);
|
full_data.extend(inserted_data_result.data);
|
||||||
full_data.append(after_data.data(), after_data.length_in_code_units());
|
full_data.append(after_data.span().data(), after_data.length_in_code_units());
|
||||||
Utf16View full_view { full_data };
|
Utf16View full_view { full_data };
|
||||||
|
|
||||||
bool characters_are_the_same = utf16_view == full_view;
|
bool characters_are_the_same = utf16_view == full_view;
|
||||||
|
|
|
@ -106,7 +106,7 @@ WebIDL::ExceptionOr<FileReader::Result> FileReader::blob_package_data(JS::Realm&
|
||||||
return JS::ArrayBuffer::create(realm, move(bytes));
|
return JS::ArrayBuffer::create(realm, move(bytes));
|
||||||
case Type::BinaryString:
|
case Type::BinaryString:
|
||||||
// Return bytes as a binary string, in which every byte is represented by a code unit of equal value [0..255].
|
// Return bytes as a binary string, in which every byte is represented by a code unit of equal value [0..255].
|
||||||
Vector<u16> builder;
|
Utf16Data builder;
|
||||||
builder.ensure_capacity(bytes.size());
|
builder.ensure_capacity(bytes.size());
|
||||||
for (auto byte : bytes.bytes())
|
for (auto byte : bytes.bytes())
|
||||||
builder.unchecked_append(byte);
|
builder.unchecked_append(byte);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
|
* Copyright (c) 2021-2025, Tim Flynn <trflynn89@ladybird.org>
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
@ -60,8 +60,7 @@ TEST_CASE(encode_utf8)
|
||||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), utf8_string);
|
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), utf8_string);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
auto encoded = Array { (u16)0xd83d };
|
Utf16View view { u"\xd83d"sv };
|
||||||
Utf16View view { encoded };
|
|
||||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv);
|
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv);
|
||||||
EXPECT(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No).is_error());
|
EXPECT(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No).is_error());
|
||||||
}
|
}
|
||||||
|
@ -69,11 +68,8 @@ TEST_CASE(encode_utf8)
|
||||||
|
|
||||||
TEST_CASE(decode_utf16)
|
TEST_CASE(decode_utf16)
|
||||||
{
|
{
|
||||||
// Same string as the decode_utf8 test.
|
Utf16View view { u"Привет, мир! 😀 γειά σου κόσμος こんにちは世界"sv };
|
||||||
auto encoded = Array { (u16)0x041f, 0x0440, 0x0438, 0x0432, 0x0435, 0x0442, 0x002c, 0x0020, 0x043c, 0x0438, 0x0440, 0x0021, 0x0020, 0xd83d, 0xde00, 0x0020, 0x03b3, 0x03b5, 0x03b9, 0x03ac, 0x0020, 0x03c3, 0x03bf, 0x03c5, 0x0020, 0x03ba, 0x03cc, 0x03c3, 0x03bc, 0x03bf, 0x03c2, 0x0020, 0x3053, 0x3093, 0x306b, 0x3061, 0x306f, 0x4e16, 0x754c };
|
EXPECT_EQ(view.length_in_code_units(), 39uz);
|
||||||
|
|
||||||
Utf16View view { encoded };
|
|
||||||
EXPECT_EQ(encoded.size(), view.length_in_code_units());
|
|
||||||
|
|
||||||
size_t valid_code_units = 0;
|
size_t valid_code_units = 0;
|
||||||
EXPECT(view.validate(valid_code_units));
|
EXPECT(view.validate(valid_code_units));
|
||||||
|
@ -113,18 +109,18 @@ TEST_CASE(null_view)
|
||||||
TEST_CASE(utf16_literal)
|
TEST_CASE(utf16_literal)
|
||||||
{
|
{
|
||||||
{
|
{
|
||||||
Utf16View view { u"" };
|
Utf16View view { u""sv };
|
||||||
EXPECT(view.validate());
|
EXPECT(view.validate());
|
||||||
EXPECT_EQ(view.length_in_code_units(), 0u);
|
EXPECT_EQ(view.length_in_code_units(), 0u);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
Utf16View view { u"a" };
|
Utf16View view { u"a"sv };
|
||||||
EXPECT(view.validate());
|
EXPECT(view.validate());
|
||||||
EXPECT_EQ(view.length_in_code_units(), 1u);
|
EXPECT_EQ(view.length_in_code_units(), 1u);
|
||||||
EXPECT_EQ(view.code_unit_at(0), 0x61u);
|
EXPECT_EQ(view.code_unit_at(0), 0x61u);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
Utf16View view { u"abc" };
|
Utf16View view { u"abc"sv };
|
||||||
EXPECT(view.validate());
|
EXPECT(view.validate());
|
||||||
EXPECT_EQ(view.length_in_code_units(), 3u);
|
EXPECT_EQ(view.length_in_code_units(), 3u);
|
||||||
EXPECT_EQ(view.code_unit_at(0), 0x61u);
|
EXPECT_EQ(view.code_unit_at(0), 0x61u);
|
||||||
|
@ -132,7 +128,7 @@ TEST_CASE(utf16_literal)
|
||||||
EXPECT_EQ(view.code_unit_at(2), 0x63u);
|
EXPECT_EQ(view.code_unit_at(2), 0x63u);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
Utf16View view { u"🙃" };
|
Utf16View view { u"🙃"sv };
|
||||||
EXPECT(view.validate());
|
EXPECT(view.validate());
|
||||||
EXPECT_EQ(view.length_in_code_units(), 2u);
|
EXPECT_EQ(view.length_in_code_units(), 2u);
|
||||||
EXPECT_EQ(view.code_unit_at(0), 0xd83du);
|
EXPECT_EQ(view.code_unit_at(0), 0xd83du);
|
||||||
|
@ -190,14 +186,14 @@ TEST_CASE(validate_invalid_utf16)
|
||||||
Utf16View invalid;
|
Utf16View invalid;
|
||||||
{
|
{
|
||||||
// Lonely high surrogate.
|
// Lonely high surrogate.
|
||||||
invalid = u"\xd800";
|
invalid = u"\xd800"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 1uz);
|
EXPECT_EQ(valid_code_units, 1uz);
|
||||||
|
|
||||||
invalid = u"\xdbff";
|
invalid = u"\xdbff"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
|
@ -206,14 +202,14 @@ TEST_CASE(validate_invalid_utf16)
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// Lonely low surrogate.
|
// Lonely low surrogate.
|
||||||
invalid = u"\xdc00";
|
invalid = u"\xdc00"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 1uz);
|
EXPECT_EQ(valid_code_units, 1uz);
|
||||||
|
|
||||||
invalid = u"\xdfff";
|
invalid = u"\xdfff"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
|
@ -222,14 +218,14 @@ TEST_CASE(validate_invalid_utf16)
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// High surrogate followed by non-surrogate.
|
// High surrogate followed by non-surrogate.
|
||||||
invalid = u"\xd800\x0000";
|
invalid = u"\xd800\x0000"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 2uz);
|
EXPECT_EQ(valid_code_units, 2uz);
|
||||||
|
|
||||||
invalid = u"\xd800\xe000";
|
invalid = u"\xd800\xe000"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
|
@ -238,14 +234,14 @@ TEST_CASE(validate_invalid_utf16)
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// High surrogate followed by high surrogate.
|
// High surrogate followed by high surrogate.
|
||||||
invalid = u"\xd800\xd800";
|
invalid = u"\xd800\xd800"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 2uz);
|
EXPECT_EQ(valid_code_units, 2uz);
|
||||||
|
|
||||||
invalid = u"\xd800\xdbff";
|
invalid = u"\xd800\xdbff"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
|
@ -254,14 +250,14 @@ TEST_CASE(validate_invalid_utf16)
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// Valid UTF-16 followed by invalid code units.
|
// Valid UTF-16 followed by invalid code units.
|
||||||
invalid = u"\x0041\x0041\xd800";
|
invalid = u"\x0041\x0041\xd800"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units));
|
||||||
EXPECT_EQ(valid_code_units, 2uz);
|
EXPECT_EQ(valid_code_units, 2uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 3uz);
|
EXPECT_EQ(valid_code_units, 3uz);
|
||||||
|
|
||||||
invalid = u"\x0041\x0041\xd800";
|
invalid = u"\x0041\x0041\xd800"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units));
|
||||||
EXPECT_EQ(valid_code_units, 2uz);
|
EXPECT_EQ(valid_code_units, 2uz);
|
||||||
|
|
||||||
|
@ -274,10 +270,8 @@ TEST_CASE(decode_invalid_utf16)
|
||||||
{
|
{
|
||||||
{
|
{
|
||||||
// Lonely high surrogate.
|
// Lonely high surrogate.
|
||||||
auto invalid = Array { (u16)0x41, 0x42, 0xd800 };
|
Utf16View view { u"AB\xd800"sv };
|
||||||
|
EXPECT_EQ(view.length_in_code_units(), 3uz);
|
||||||
Utf16View view { invalid };
|
|
||||||
EXPECT_EQ(invalid.size(), view.length_in_code_units());
|
|
||||||
|
|
||||||
auto expected = Array { (u32)0x41, 0x42, 0xfffd };
|
auto expected = Array { (u32)0x41, 0x42, 0xfffd };
|
||||||
EXPECT_EQ(expected.size(), view.length_in_code_points());
|
EXPECT_EQ(expected.size(), view.length_in_code_points());
|
||||||
|
@ -290,10 +284,8 @@ TEST_CASE(decode_invalid_utf16)
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// Lonely low surrogate.
|
// Lonely low surrogate.
|
||||||
auto invalid = Array { (u16)0x41, 0x42, 0xdc00 };
|
Utf16View view { u"AB\xdc00"sv };
|
||||||
|
EXPECT_EQ(view.length_in_code_units(), 3uz);
|
||||||
Utf16View view { invalid };
|
|
||||||
EXPECT_EQ(invalid.size(), view.length_in_code_units());
|
|
||||||
|
|
||||||
auto expected = Array { (u32)0x41, 0x42, 0xfffd };
|
auto expected = Array { (u32)0x41, 0x42, 0xfffd };
|
||||||
EXPECT_EQ(expected.size(), view.length_in_code_points());
|
EXPECT_EQ(expected.size(), view.length_in_code_points());
|
||||||
|
@ -306,10 +298,8 @@ TEST_CASE(decode_invalid_utf16)
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// High surrogate followed by non-surrogate.
|
// High surrogate followed by non-surrogate.
|
||||||
auto invalid = Array { (u16)0x41, 0x42, 0xd800, 0 };
|
Utf16View view { u"AB\xd800\x0000"sv };
|
||||||
|
EXPECT_EQ(view.length_in_code_units(), 4uz);
|
||||||
Utf16View view { invalid };
|
|
||||||
EXPECT_EQ(invalid.size(), view.length_in_code_units());
|
|
||||||
|
|
||||||
auto expected = Array { (u32)0x41, 0x42, 0xfffd, 0 };
|
auto expected = Array { (u32)0x41, 0x42, 0xfffd, 0 };
|
||||||
EXPECT_EQ(expected.size(), view.length_in_code_points());
|
EXPECT_EQ(expected.size(), view.length_in_code_points());
|
||||||
|
@ -322,10 +312,8 @@ TEST_CASE(decode_invalid_utf16)
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// High surrogate followed by high surrogate.
|
// High surrogate followed by high surrogate.
|
||||||
auto invalid = Array { (u16)0x41, 0x42, 0xd800, 0xd800 };
|
Utf16View view { u"AB\xd800\xd800"sv };
|
||||||
|
EXPECT_EQ(view.length_in_code_units(), 4uz);
|
||||||
Utf16View view { invalid };
|
|
||||||
EXPECT_EQ(invalid.size(), view.length_in_code_units());
|
|
||||||
|
|
||||||
auto expected = Array { (u32)0x41, 0x42, 0xfffd, 0xfffd };
|
auto expected = Array { (u32)0x41, 0x42, 0xfffd, 0xfffd };
|
||||||
EXPECT_EQ(expected.size(), view.length_in_code_points());
|
EXPECT_EQ(expected.size(), view.length_in_code_points());
|
||||||
|
@ -341,13 +329,13 @@ TEST_CASE(decode_invalid_utf16)
|
||||||
TEST_CASE(is_ascii)
|
TEST_CASE(is_ascii)
|
||||||
{
|
{
|
||||||
EXPECT(Utf16View {}.is_ascii());
|
EXPECT(Utf16View {}.is_ascii());
|
||||||
EXPECT(Utf16View { u"a" }.is_ascii());
|
EXPECT(u"a"sv.is_ascii());
|
||||||
EXPECT(Utf16View { u"foo" }.is_ascii());
|
EXPECT(u"foo"sv.is_ascii());
|
||||||
EXPECT(Utf16View { u"foo\t\n\rbar\v\b123" }.is_ascii());
|
EXPECT(u"foo\t\n\rbar\v\b123"sv.is_ascii());
|
||||||
|
|
||||||
EXPECT(!Utf16View { u"😀" }.is_ascii());
|
EXPECT(!u"😀"sv.is_ascii());
|
||||||
EXPECT(!Utf16View { u"foo 😀" }.is_ascii());
|
EXPECT(!u"foo 😀"sv.is_ascii());
|
||||||
EXPECT(!Utf16View { u"😀 foo" }.is_ascii());
|
EXPECT(!u"😀 foo"sv.is_ascii());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE(equals_ignoring_case)
|
TEST_CASE(equals_ignoring_case)
|
||||||
|
@ -387,28 +375,28 @@ TEST_CASE(substring_view)
|
||||||
|
|
||||||
TEST_CASE(starts_with)
|
TEST_CASE(starts_with)
|
||||||
{
|
{
|
||||||
EXPECT(Utf16View {}.starts_with(u""));
|
EXPECT(Utf16View {}.starts_with(u""sv));
|
||||||
EXPECT(!Utf16View {}.starts_with(u" "));
|
EXPECT(!Utf16View {}.starts_with(u" "sv));
|
||||||
|
|
||||||
EXPECT(Utf16View { u"a" }.starts_with(u""));
|
EXPECT(u"a"sv.starts_with(u""sv));
|
||||||
EXPECT(Utf16View { u"a" }.starts_with(u"a"));
|
EXPECT(u"a"sv.starts_with(u"a"sv));
|
||||||
EXPECT(!Utf16View { u"a" }.starts_with(u"b"));
|
EXPECT(!u"a"sv.starts_with(u"b"sv));
|
||||||
EXPECT(!Utf16View { u"a" }.starts_with(u"ab"));
|
EXPECT(!u"a"sv.starts_with(u"ab"sv));
|
||||||
|
|
||||||
EXPECT(Utf16View { u"abc" }.starts_with(u""));
|
EXPECT(u"abc"sv.starts_with(u""sv));
|
||||||
EXPECT(Utf16View { u"abc" }.starts_with(u"a"));
|
EXPECT(u"abc"sv.starts_with(u"a"sv));
|
||||||
EXPECT(Utf16View { u"abc" }.starts_with(u"ab"));
|
EXPECT(u"abc"sv.starts_with(u"ab"sv));
|
||||||
EXPECT(Utf16View { u"abc" }.starts_with(u"abc"));
|
EXPECT(u"abc"sv.starts_with(u"abc"sv));
|
||||||
EXPECT(!Utf16View { u"abc" }.starts_with(u"b"));
|
EXPECT(!u"abc"sv.starts_with(u"b"sv));
|
||||||
EXPECT(!Utf16View { u"abc" }.starts_with(u"bc"));
|
EXPECT(!u"abc"sv.starts_with(u"bc"sv));
|
||||||
|
|
||||||
auto emoji = Utf16View { u"😀🙃" };
|
auto emoji = u"😀🙃"sv;
|
||||||
|
|
||||||
EXPECT(emoji.starts_with(u""));
|
EXPECT(emoji.starts_with(u""sv));
|
||||||
EXPECT(emoji.starts_with(u"😀"));
|
EXPECT(emoji.starts_with(u"😀"sv));
|
||||||
EXPECT(emoji.starts_with(u"😀🙃"));
|
EXPECT(emoji.starts_with(u"😀🙃"sv));
|
||||||
EXPECT(!emoji.starts_with(u"a"));
|
EXPECT(!emoji.starts_with(u"a"sv));
|
||||||
EXPECT(!emoji.starts_with(u"🙃"));
|
EXPECT(!emoji.starts_with(u"🙃"sv));
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE(find_code_unit_offset)
|
TEST_CASE(find_code_unit_offset)
|
||||||
|
@ -416,16 +404,16 @@ TEST_CASE(find_code_unit_offset)
|
||||||
auto conversion_result = MUST(AK::utf8_to_utf16("😀foo😀bar"sv));
|
auto conversion_result = MUST(AK::utf8_to_utf16("😀foo😀bar"sv));
|
||||||
Utf16View const view { conversion_result };
|
Utf16View const view { conversion_result };
|
||||||
|
|
||||||
EXPECT_EQ(0u, view.find_code_unit_offset(u"").value());
|
EXPECT_EQ(0u, view.find_code_unit_offset(u""sv).value());
|
||||||
EXPECT_EQ(4u, view.find_code_unit_offset(u"", 4).value());
|
EXPECT_EQ(4u, view.find_code_unit_offset(u""sv, 4).value());
|
||||||
EXPECT(!view.find_code_unit_offset(u"", 16).has_value());
|
EXPECT(!view.find_code_unit_offset(u""sv, 16).has_value());
|
||||||
|
|
||||||
EXPECT_EQ(0u, view.find_code_unit_offset(u"😀").value());
|
EXPECT_EQ(0u, view.find_code_unit_offset(u"😀"sv).value());
|
||||||
EXPECT_EQ(5u, view.find_code_unit_offset(u"😀", 1).value());
|
EXPECT_EQ(5u, view.find_code_unit_offset(u"😀"sv, 1).value());
|
||||||
EXPECT_EQ(2u, view.find_code_unit_offset(u"foo").value());
|
EXPECT_EQ(2u, view.find_code_unit_offset(u"foo"sv).value());
|
||||||
EXPECT_EQ(7u, view.find_code_unit_offset(u"bar").value());
|
EXPECT_EQ(7u, view.find_code_unit_offset(u"bar"sv).value());
|
||||||
|
|
||||||
EXPECT(!view.find_code_unit_offset(u"baz").has_value());
|
EXPECT(!view.find_code_unit_offset(u"baz"sv).has_value());
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE(find_code_unit_offset_ignoring_case)
|
TEST_CASE(find_code_unit_offset_ignoring_case)
|
||||||
|
@ -433,13 +421,13 @@ TEST_CASE(find_code_unit_offset_ignoring_case)
|
||||||
auto conversion_result = MUST(AK::utf8_to_utf16("😀Foo😀Bar"sv));
|
auto conversion_result = MUST(AK::utf8_to_utf16("😀Foo😀Bar"sv));
|
||||||
Utf16View const view { conversion_result };
|
Utf16View const view { conversion_result };
|
||||||
|
|
||||||
EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u"").value());
|
EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u""sv).value());
|
||||||
EXPECT_EQ(4u, view.find_code_unit_offset_ignoring_case(u"", 4).value());
|
EXPECT_EQ(4u, view.find_code_unit_offset_ignoring_case(u""sv, 4).value());
|
||||||
EXPECT(!view.find_code_unit_offset_ignoring_case(u"", 16).has_value());
|
EXPECT(!view.find_code_unit_offset_ignoring_case(u""sv, 16).has_value());
|
||||||
|
|
||||||
EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u"😀").value());
|
EXPECT_EQ(0u, view.find_code_unit_offset_ignoring_case(u"😀"sv).value());
|
||||||
EXPECT_EQ(5u, view.find_code_unit_offset_ignoring_case(u"😀", 1).value());
|
EXPECT_EQ(5u, view.find_code_unit_offset_ignoring_case(u"😀"sv, 1).value());
|
||||||
EXPECT_EQ(2u, view.find_code_unit_offset_ignoring_case(u"foO").value());
|
EXPECT_EQ(2u, view.find_code_unit_offset_ignoring_case(u"foO"sv).value());
|
||||||
EXPECT_EQ(7u, view.find_code_unit_offset_ignoring_case(u"baR").value());
|
EXPECT_EQ(7u, view.find_code_unit_offset_ignoring_case(u"baR"sv).value());
|
||||||
EXPECT(!view.find_code_unit_offset_ignoring_case(u"baz").has_value());
|
EXPECT(!view.find_code_unit_offset_ignoring_case(u"baz"sv).has_value());
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue