mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-07-31 13:19:05 +00:00
AK+LibJS: Extract some UTF-16 helpers for use in an outside class
An upcoming Utf16String will need access to these helpers. Let's make them publicly available.
This commit is contained in:
parent
b6dc5050d2
commit
66006d3812
Notes:
github-actions[bot]
2025-07-03 13:54:12 +00:00
Author: https://github.com/trflynn89
Commit: 66006d3812
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5228
Reviewed-by: https://github.com/ADKaster ✅
Reviewed-by: https://github.com/shannonbooth
10 changed files with 121 additions and 86 deletions
|
@ -10,7 +10,7 @@
|
||||||
#include <AK/GenericLexer.h>
|
#include <AK/GenericLexer.h>
|
||||||
#include <AK/ScopeGuard.h>
|
#include <AK/ScopeGuard.h>
|
||||||
#include <AK/StringBuilder.h>
|
#include <AK/StringBuilder.h>
|
||||||
#include <AK/Utf16View.h>
|
#include <AK/UnicodeUtils.h>
|
||||||
|
|
||||||
namespace AK {
|
namespace AK {
|
||||||
|
|
||||||
|
@ -266,7 +266,7 @@ auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pair
|
||||||
auto high_surrogate = decode_one_surrogate();
|
auto high_surrogate = decode_one_surrogate();
|
||||||
if (!high_surrogate.has_value())
|
if (!high_surrogate.has_value())
|
||||||
return UnicodeEscapeError::MalformedUnicodeEscape;
|
return UnicodeEscapeError::MalformedUnicodeEscape;
|
||||||
if (!Utf16View::is_high_surrogate(*high_surrogate))
|
if (!UnicodeUtils::is_utf16_high_surrogate(*high_surrogate))
|
||||||
return *high_surrogate;
|
return *high_surrogate;
|
||||||
if (!combine_surrogate_pairs || !consume_specific("\\u"sv))
|
if (!combine_surrogate_pairs || !consume_specific("\\u"sv))
|
||||||
return *high_surrogate;
|
return *high_surrogate;
|
||||||
|
@ -274,8 +274,8 @@ auto GenericLexer::decode_single_or_paired_surrogate(bool combine_surrogate_pair
|
||||||
auto low_surrogate = decode_one_surrogate();
|
auto low_surrogate = decode_one_surrogate();
|
||||||
if (!low_surrogate.has_value())
|
if (!low_surrogate.has_value())
|
||||||
return UnicodeEscapeError::MalformedUnicodeEscape;
|
return UnicodeEscapeError::MalformedUnicodeEscape;
|
||||||
if (Utf16View::is_low_surrogate(*low_surrogate))
|
if (UnicodeUtils::is_utf16_low_surrogate(*low_surrogate))
|
||||||
return Utf16View::decode_surrogate_pair(*high_surrogate, *low_surrogate);
|
return UnicodeUtils::decode_utf16_surrogate_pair(*high_surrogate, *low_surrogate);
|
||||||
|
|
||||||
retreat(6);
|
retreat(6);
|
||||||
return *high_surrogate;
|
return *high_surrogate;
|
||||||
|
|
|
@ -283,7 +283,7 @@ ErrorOr<void> StringBuilder::try_append(Utf16View const& utf16_view)
|
||||||
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 12) & 0x0f) | 0xe0);
|
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 12) & 0x0f) | 0xe0);
|
||||||
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 6) & 0x3f) | 0x80);
|
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 6) & 0x3f) | 0x80);
|
||||||
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 0) & 0x3f) | 0x80);
|
uninitialized_data_pointer[bytes_just_written++] = (((code_unit >> 0) & 0x3f) | 0x80);
|
||||||
} while (first_invalid_code_unit < remaining_view.length_in_code_units() && Utf16View::is_low_surrogate(remaining_view.data()[first_invalid_code_unit]));
|
} while (first_invalid_code_unit < remaining_view.length_in_code_units() && UnicodeUtils::is_utf16_low_surrogate(remaining_view.data()[first_invalid_code_unit]));
|
||||||
|
|
||||||
// Code unit might no longer be invalid, retry on the remaining data.
|
// Code unit might no longer be invalid, retry on the remaining data.
|
||||||
m_buffer.set_size(m_buffer.size() + bytes_just_written);
|
m_buffer.set_size(m_buffer.size() + bytes_just_written);
|
||||||
|
|
|
@ -78,6 +78,82 @@ template<FallibleFunction<char> Callback>
|
||||||
return -1;
|
return -1;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
constexpr inline u16 HIGH_SURROGATE_MIN = 0xd800;
|
||||||
|
constexpr inline u16 HIGH_SURROGATE_MAX = 0xdbff;
|
||||||
|
constexpr inline u16 LOW_SURROGATE_MIN = 0xdc00;
|
||||||
|
constexpr inline u16 LOW_SURROGATE_MAX = 0xdfff;
|
||||||
|
constexpr inline u32 REPLACEMENT_CODE_POINT = 0xfffd;
|
||||||
|
constexpr inline u32 FIRST_SUPPLEMENTARY_PLANE_CODE_POINT = 0x10000;
|
||||||
|
|
||||||
|
[[nodiscard]] constexpr size_t code_unit_length_for_code_point(u32 code_point)
|
||||||
|
{
|
||||||
|
return code_point < FIRST_SUPPLEMENTARY_PLANE_CODE_POINT ? 1uz : 2uz;
|
||||||
|
}
|
||||||
|
|
||||||
|
[[nodiscard]] constexpr bool is_utf16_high_surrogate(u16 code_unit)
|
||||||
|
{
|
||||||
|
return (code_unit >= HIGH_SURROGATE_MIN) && (code_unit <= HIGH_SURROGATE_MAX);
|
||||||
|
}
|
||||||
|
|
||||||
|
[[nodiscard]] constexpr bool is_utf16_low_surrogate(u16 code_unit)
|
||||||
|
{
|
||||||
|
return (code_unit >= LOW_SURROGATE_MIN) && (code_unit <= LOW_SURROGATE_MAX);
|
||||||
|
}
|
||||||
|
|
||||||
|
[[nodiscard]] constexpr u32 decode_utf16_surrogate_pair(u16 high_surrogate, u16 low_surrogate)
|
||||||
|
{
|
||||||
|
VERIFY(is_utf16_high_surrogate(high_surrogate));
|
||||||
|
VERIFY(is_utf16_low_surrogate(low_surrogate));
|
||||||
|
|
||||||
|
return ((high_surrogate - HIGH_SURROGATE_MIN) << 10) + (low_surrogate - LOW_SURROGATE_MIN) + FIRST_SUPPLEMENTARY_PLANE_CODE_POINT;
|
||||||
|
}
|
||||||
|
|
||||||
|
template<typename Callback>
|
||||||
|
[[nodiscard]] constexpr size_t code_point_to_utf16(u32 code_point, Callback callback)
|
||||||
|
{
|
||||||
|
if (code_point < FIRST_SUPPLEMENTARY_PLANE_CODE_POINT) {
|
||||||
|
callback(static_cast<char16_t>(code_point));
|
||||||
|
return 1uz;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (code_point <= 0x10ffff) {
|
||||||
|
code_point -= FIRST_SUPPLEMENTARY_PLANE_CODE_POINT;
|
||||||
|
|
||||||
|
auto code_unit = static_cast<u16>(HIGH_SURROGATE_MIN | (code_point >> 10));
|
||||||
|
callback(static_cast<char16_t>(code_unit));
|
||||||
|
|
||||||
|
code_unit = static_cast<u16>(LOW_SURROGATE_MIN | (code_point & 0x3ff));
|
||||||
|
callback(static_cast<char16_t>(code_unit));
|
||||||
|
|
||||||
|
return 2uz;
|
||||||
|
}
|
||||||
|
|
||||||
|
VERIFY_NOT_REACHED();
|
||||||
|
}
|
||||||
|
|
||||||
|
template<FallibleFunction<char16_t> Callback>
|
||||||
|
constexpr ErrorOr<size_t> try_code_point_to_utf16(u32 code_point, Callback callback)
|
||||||
|
{
|
||||||
|
if (code_point < FIRST_SUPPLEMENTARY_PLANE_CODE_POINT) {
|
||||||
|
TRY(callback(static_cast<char16_t>(code_point)));
|
||||||
|
return 1uz;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (code_point <= 0x10ffff) {
|
||||||
|
code_point -= FIRST_SUPPLEMENTARY_PLANE_CODE_POINT;
|
||||||
|
|
||||||
|
auto code_unit = static_cast<u16>(HIGH_SURROGATE_MIN | (code_point >> 10));
|
||||||
|
TRY(callback(static_cast<char16_t>(code_unit)));
|
||||||
|
|
||||||
|
code_unit = static_cast<u16>(LOW_SURROGATE_MIN | (code_point & 0x3ff));
|
||||||
|
TRY(callback(static_cast<char16_t>(code_unit)));
|
||||||
|
|
||||||
|
return 2uz;
|
||||||
|
}
|
||||||
|
|
||||||
|
VERIFY_NOT_REACHED();
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Compute the maximum number of UTF-8 bytes needed to store a given UTF-16 string, accounting for unmatched UTF-16 surrogates.
|
* Compute the maximum number of UTF-8 bytes needed to store a given UTF-16 string, accounting for unmatched UTF-16 surrogates.
|
||||||
* This function will overcount by at most 33%; 2 bytes for every valid UTF-16 codepoint between U+10000 and U+10FFFF.
|
* This function will overcount by at most 33%; 2 bytes for every valid UTF-16 codepoint between U+10000 and U+10FFFF.
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
|
* Copyright (c) 2021-2025, Tim Flynn <trflynn89@ladybird.org>
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
@ -16,13 +16,6 @@
|
||||||
|
|
||||||
namespace AK {
|
namespace AK {
|
||||||
|
|
||||||
static constexpr u16 high_surrogate_min = 0xd800;
|
|
||||||
static constexpr u16 high_surrogate_max = 0xdbff;
|
|
||||||
static constexpr u16 low_surrogate_min = 0xdc00;
|
|
||||||
static constexpr u16 low_surrogate_max = 0xdfff;
|
|
||||||
static constexpr u32 replacement_code_point = 0xfffd;
|
|
||||||
static constexpr u32 first_supplementary_plane_code_point = 0x10000;
|
|
||||||
|
|
||||||
static constexpr u16 host_code_unit(u16 code_unit, Endianness endianness)
|
static constexpr u16 host_code_unit(u16 code_unit, Endianness endianness)
|
||||||
{
|
{
|
||||||
switch (endianness) {
|
switch (endianness) {
|
||||||
|
@ -44,7 +37,11 @@ static ErrorOr<Utf16ConversionResult> to_utf16_slow(UtfViewType const& view, End
|
||||||
|
|
||||||
size_t code_point_count = 0;
|
size_t code_point_count = 0;
|
||||||
for (auto code_point : view) {
|
for (auto code_point : view) {
|
||||||
TRY(code_point_to_utf16(utf16_data, code_point, endianness));
|
TRY(UnicodeUtils::try_code_point_to_utf16(code_point, [&](auto code_unit) -> ErrorOr<void> {
|
||||||
|
TRY(utf16_data.try_append(host_code_unit(code_unit, endianness)));
|
||||||
|
return {};
|
||||||
|
}));
|
||||||
|
|
||||||
code_point_count++;
|
code_point_count++;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -116,48 +113,11 @@ ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const& utf32_view, Endia
|
||||||
return Utf16ConversionResult { utf16_data, length };
|
return Utf16ConversionResult { utf16_data, length };
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<void> code_point_to_utf16(Utf16Data& string, u32 code_point, Endianness endianness)
|
|
||||||
{
|
|
||||||
VERIFY(is_unicode(code_point));
|
|
||||||
|
|
||||||
if (code_point < first_supplementary_plane_code_point) {
|
|
||||||
TRY(string.try_append(host_code_unit(static_cast<u16>(code_point), endianness)));
|
|
||||||
} else {
|
|
||||||
code_point -= first_supplementary_plane_code_point;
|
|
||||||
|
|
||||||
auto code_unit = static_cast<u16>(high_surrogate_min | (code_point >> 10));
|
|
||||||
TRY(string.try_append(host_code_unit(code_unit, endianness)));
|
|
||||||
|
|
||||||
code_unit = static_cast<u16>(low_surrogate_min | (code_point & 0x3ff));
|
|
||||||
TRY(string.try_append(host_code_unit(code_unit, endianness)));
|
|
||||||
}
|
|
||||||
|
|
||||||
return {};
|
|
||||||
}
|
|
||||||
|
|
||||||
size_t utf16_code_unit_length_from_utf8(StringView string)
|
size_t utf16_code_unit_length_from_utf8(StringView string)
|
||||||
{
|
{
|
||||||
return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length());
|
return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length());
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Utf16View::is_high_surrogate(u16 code_unit)
|
|
||||||
{
|
|
||||||
return (code_unit >= high_surrogate_min) && (code_unit <= high_surrogate_max);
|
|
||||||
}
|
|
||||||
|
|
||||||
bool Utf16View::is_low_surrogate(u16 code_unit)
|
|
||||||
{
|
|
||||||
return (code_unit >= low_surrogate_min) && (code_unit <= low_surrogate_max);
|
|
||||||
}
|
|
||||||
|
|
||||||
u32 Utf16View::decode_surrogate_pair(u16 high_surrogate, u16 low_surrogate)
|
|
||||||
{
|
|
||||||
VERIFY(is_high_surrogate(high_surrogate));
|
|
||||||
VERIFY(is_low_surrogate(low_surrogate));
|
|
||||||
|
|
||||||
return ((high_surrogate - high_surrogate_min) << 10) + (low_surrogate - low_surrogate_min) + first_supplementary_plane_code_point;
|
|
||||||
}
|
|
||||||
|
|
||||||
ErrorOr<ByteString> Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invalid_code_units) const
|
ErrorOr<ByteString> Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invalid_code_units) const
|
||||||
{
|
{
|
||||||
return TRY(to_utf8(allow_invalid_code_units)).to_byte_string();
|
return TRY(to_utf8(allow_invalid_code_units)).to_byte_string();
|
||||||
|
@ -191,16 +151,16 @@ u32 Utf16View::code_point_at(size_t index) const
|
||||||
VERIFY(index < length_in_code_units());
|
VERIFY(index < length_in_code_units());
|
||||||
|
|
||||||
u32 code_point = code_unit_at(index);
|
u32 code_point = code_unit_at(index);
|
||||||
if (!is_high_surrogate(code_point) && !is_low_surrogate(code_point))
|
if (!UnicodeUtils::is_utf16_high_surrogate(code_point) && !UnicodeUtils::is_utf16_low_surrogate(code_point))
|
||||||
return code_point;
|
return code_point;
|
||||||
if (is_low_surrogate(code_point) || (index + 1 == length_in_code_units()))
|
if (UnicodeUtils::is_utf16_low_surrogate(code_point) || (index + 1 == length_in_code_units()))
|
||||||
return code_point;
|
return code_point;
|
||||||
|
|
||||||
auto second = code_unit_at(index + 1);
|
auto second = code_unit_at(index + 1);
|
||||||
if (!is_low_surrogate(second))
|
if (!UnicodeUtils::is_utf16_low_surrogate(second))
|
||||||
return code_point;
|
return code_point;
|
||||||
|
|
||||||
return decode_surrogate_pair(code_point, second);
|
return UnicodeUtils::decode_utf16_surrogate_pair(code_point, second);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
|
size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
|
||||||
|
@ -418,28 +378,23 @@ u32 Utf16CodePointIterator::operator*() const
|
||||||
|
|
||||||
auto code_unit = host_code_unit(*m_ptr, Endianness::Host);
|
auto code_unit = host_code_unit(*m_ptr, Endianness::Host);
|
||||||
|
|
||||||
if (Utf16View::is_high_surrogate(code_unit)) {
|
if (UnicodeUtils::is_utf16_high_surrogate(code_unit)) {
|
||||||
if (m_remaining_code_units > 1) {
|
if (m_remaining_code_units > 1) {
|
||||||
auto next_code_unit = host_code_unit(*(m_ptr + 1), Endianness::Host);
|
auto next_code_unit = host_code_unit(*(m_ptr + 1), Endianness::Host);
|
||||||
|
|
||||||
if (Utf16View::is_low_surrogate(next_code_unit))
|
if (UnicodeUtils::is_utf16_low_surrogate(next_code_unit))
|
||||||
return Utf16View::decode_surrogate_pair(code_unit, next_code_unit);
|
return UnicodeUtils::decode_utf16_surrogate_pair(code_unit, next_code_unit);
|
||||||
}
|
}
|
||||||
|
|
||||||
return replacement_code_point;
|
return UnicodeUtils::REPLACEMENT_CODE_POINT;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (Utf16View::is_low_surrogate(code_unit))
|
if (UnicodeUtils::is_utf16_low_surrogate(code_unit))
|
||||||
return replacement_code_point;
|
return UnicodeUtils::REPLACEMENT_CODE_POINT;
|
||||||
|
|
||||||
return static_cast<u32>(code_unit);
|
return static_cast<u32>(code_unit);
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Utf16CodePointIterator::length_in_code_units() const
|
|
||||||
{
|
|
||||||
return *(*this) < first_supplementary_plane_code_point ? 1 : 2;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool validate_utf16_le(ReadonlyBytes bytes)
|
bool validate_utf16_le(ReadonlyBytes bytes)
|
||||||
{
|
{
|
||||||
return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
|
return simdutf::validate_utf16le(reinterpret_cast<char16_t const*>(bytes.data()), bytes.size() / 2);
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (c) 2021-2024, Tim Flynn <trflynn89@serenityos.org>
|
* Copyright (c) 2021-2025, Tim Flynn <trflynn89@ladybird.org>
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
@ -15,6 +15,7 @@
|
||||||
#include <AK/Span.h>
|
#include <AK/Span.h>
|
||||||
#include <AK/String.h>
|
#include <AK/String.h>
|
||||||
#include <AK/Types.h>
|
#include <AK/Types.h>
|
||||||
|
#include <AK/UnicodeUtils.h>
|
||||||
#include <AK/Vector.h>
|
#include <AK/Vector.h>
|
||||||
|
|
||||||
namespace AK {
|
namespace AK {
|
||||||
|
@ -28,7 +29,6 @@ struct Utf16ConversionResult {
|
||||||
ErrorOr<Utf16ConversionResult> utf8_to_utf16(StringView, Endianness = Endianness::Host);
|
ErrorOr<Utf16ConversionResult> utf8_to_utf16(StringView, Endianness = Endianness::Host);
|
||||||
ErrorOr<Utf16ConversionResult> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host);
|
ErrorOr<Utf16ConversionResult> utf8_to_utf16(Utf8View const&, Endianness = Endianness::Host);
|
||||||
ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
|
ErrorOr<Utf16ConversionResult> utf32_to_utf16(Utf32View const&, Endianness = Endianness::Host);
|
||||||
ErrorOr<void> code_point_to_utf16(Utf16Data&, u32, Endianness = Endianness::Host);
|
|
||||||
|
|
||||||
[[nodiscard]] bool validate_utf16_le(ReadonlyBytes);
|
[[nodiscard]] bool validate_utf16_le(ReadonlyBytes);
|
||||||
[[nodiscard]] bool validate_utf16_be(ReadonlyBytes);
|
[[nodiscard]] bool validate_utf16_be(ReadonlyBytes);
|
||||||
|
@ -52,7 +52,10 @@ public:
|
||||||
Utf16CodePointIterator& operator++();
|
Utf16CodePointIterator& operator++();
|
||||||
u32 operator*() const;
|
u32 operator*() const;
|
||||||
|
|
||||||
size_t length_in_code_units() const;
|
size_t length_in_code_units() const
|
||||||
|
{
|
||||||
|
return UnicodeUtils::code_unit_length_for_code_point(**this);
|
||||||
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
Utf16CodePointIterator(u16 const* ptr, size_t length)
|
Utf16CodePointIterator(u16 const* ptr, size_t length)
|
||||||
|
@ -69,10 +72,6 @@ class Utf16View {
|
||||||
public:
|
public:
|
||||||
using Iterator = Utf16CodePointIterator;
|
using Iterator = Utf16CodePointIterator;
|
||||||
|
|
||||||
static bool is_high_surrogate(u16);
|
|
||||||
static bool is_low_surrogate(u16);
|
|
||||||
static u32 decode_surrogate_pair(u16 high_surrogate, u16 low_surrogate);
|
|
||||||
|
|
||||||
Utf16View() = default;
|
Utf16View() = default;
|
||||||
~Utf16View() = default;
|
~Utf16View() = default;
|
||||||
|
|
||||||
|
|
|
@ -9,13 +9,14 @@
|
||||||
* SPDX-License-Identifier: BSD-2-Clause
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include "Parser.h"
|
|
||||||
#include <AK/Array.h>
|
#include <AK/Array.h>
|
||||||
#include <AK/CharacterTypes.h>
|
#include <AK/CharacterTypes.h>
|
||||||
#include <AK/HashTable.h>
|
#include <AK/HashTable.h>
|
||||||
#include <AK/ScopeGuard.h>
|
#include <AK/ScopeGuard.h>
|
||||||
#include <AK/StdLibExtras.h>
|
#include <AK/StdLibExtras.h>
|
||||||
#include <AK/TemporaryChange.h>
|
#include <AK/TemporaryChange.h>
|
||||||
|
#include <AK/UnicodeUtils.h>
|
||||||
|
#include <LibJS/Parser.h>
|
||||||
#include <LibJS/Runtime/RegExpObject.h>
|
#include <LibJS/Runtime/RegExpObject.h>
|
||||||
#include <LibRegex/Regex.h>
|
#include <LibRegex/Regex.h>
|
||||||
|
|
||||||
|
@ -4601,7 +4602,7 @@ FlyString Parser::consume_string_value()
|
||||||
Utf8View view { value.bytes_as_string_view().substring_view(value.bytes().size() - 3) };
|
Utf8View view { value.bytes_as_string_view().substring_view(value.bytes().size() - 3) };
|
||||||
VERIFY(view.length() <= 3);
|
VERIFY(view.length() <= 3);
|
||||||
auto codepoint = *view.begin();
|
auto codepoint = *view.begin();
|
||||||
if (Utf16View::is_high_surrogate(codepoint)) {
|
if (AK::UnicodeUtils::is_utf16_high_surrogate(codepoint)) {
|
||||||
syntax_error("StringValue ending with unpaired high surrogate"_string);
|
syntax_error("StringValue ending with unpaired high surrogate"_string);
|
||||||
VERIFY(view.length() == 1);
|
VERIFY(view.length() == 1);
|
||||||
}
|
}
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
#include <AK/CharacterTypes.h>
|
#include <AK/CharacterTypes.h>
|
||||||
#include <AK/FlyString.h>
|
#include <AK/FlyString.h>
|
||||||
#include <AK/StringBuilder.h>
|
#include <AK/StringBuilder.h>
|
||||||
|
#include <AK/UnicodeUtils.h>
|
||||||
#include <AK/Utf16View.h>
|
#include <AK/Utf16View.h>
|
||||||
#include <AK/Utf8View.h>
|
#include <AK/Utf8View.h>
|
||||||
#include <LibJS/Runtime/AbstractOperations.h>
|
#include <LibJS/Runtime/AbstractOperations.h>
|
||||||
|
@ -308,7 +309,7 @@ void RopeString::resolve(EncodingPreference preference) const
|
||||||
auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
|
auto high_surrogate = *Utf8View(previous_string_as_utf8.substring_view(previous_string_as_utf8.length() - 3)).begin();
|
||||||
auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
|
auto low_surrogate = *Utf8View(current_string_as_utf8).begin();
|
||||||
|
|
||||||
if (!Utf16View::is_high_surrogate(high_surrogate) || !Utf16View::is_low_surrogate(low_surrogate)) {
|
if (!AK::UnicodeUtils::is_utf16_high_surrogate(high_surrogate) || !AK::UnicodeUtils::is_utf16_low_surrogate(low_surrogate)) {
|
||||||
builder.append(current_string_as_utf8);
|
builder.append(current_string_as_utf8);
|
||||||
previous = current;
|
previous = current;
|
||||||
continue;
|
continue;
|
||||||
|
@ -316,7 +317,7 @@ void RopeString::resolve(EncodingPreference preference) const
|
||||||
|
|
||||||
// Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
|
// Remove 3 bytes from the builder and replace them with the UTF-8 encoded code point.
|
||||||
builder.trim(3);
|
builder.trim(3);
|
||||||
builder.append_code_point(Utf16View::decode_surrogate_pair(high_surrogate, low_surrogate));
|
builder.append_code_point(AK::UnicodeUtils::decode_utf16_surrogate_pair(high_surrogate, low_surrogate));
|
||||||
|
|
||||||
// Append the remaining part of the current string.
|
// Append the remaining part of the current string.
|
||||||
builder.append(current_string_as_utf8.substring_view(3));
|
builder.append(current_string_as_utf8.substring_view(3));
|
||||||
|
|
|
@ -5,8 +5,8 @@
|
||||||
*/
|
*/
|
||||||
|
|
||||||
#include <AK/StringBuilder.h>
|
#include <AK/StringBuilder.h>
|
||||||
|
#include <AK/UnicodeUtils.h>
|
||||||
#include <AK/Utf16View.h>
|
#include <AK/Utf16View.h>
|
||||||
#include <AK/Utf32View.h>
|
|
||||||
#include <LibJS/Runtime/AbstractOperations.h>
|
#include <LibJS/Runtime/AbstractOperations.h>
|
||||||
#include <LibJS/Runtime/Array.h>
|
#include <LibJS/Runtime/Array.h>
|
||||||
#include <LibJS/Runtime/Error.h>
|
#include <LibJS/Runtime/Error.h>
|
||||||
|
@ -129,7 +129,9 @@ JS_DEFINE_NATIVE_FUNCTION(StringConstructor::from_code_point)
|
||||||
return vm.throw_completion<RangeError>(ErrorType::InvalidCodePoint, next_code_point.to_string_without_side_effects());
|
return vm.throw_completion<RangeError>(ErrorType::InvalidCodePoint, next_code_point.to_string_without_side_effects());
|
||||||
|
|
||||||
// d. Set result to the string-concatenation of result and UTF16EncodeCodePoint(ℝ(nextCP)).
|
// d. Set result to the string-concatenation of result and UTF16EncodeCodePoint(ℝ(nextCP)).
|
||||||
MUST(code_point_to_utf16(string, static_cast<u32>(code_point)));
|
(void)AK::UnicodeUtils::code_point_to_utf16(static_cast<u32>(code_point), [&](auto code_unit) {
|
||||||
|
string.append(code_unit);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3. Assert: If codePoints is empty, then result is the empty String.
|
// 3. Assert: If codePoints is empty, then result is the empty String.
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
#include <AK/Checked.h>
|
#include <AK/Checked.h>
|
||||||
#include <AK/Function.h>
|
#include <AK/Function.h>
|
||||||
#include <AK/StringBuilder.h>
|
#include <AK/StringBuilder.h>
|
||||||
|
#include <AK/UnicodeUtils.h>
|
||||||
#include <AK/Utf16View.h>
|
#include <AK/Utf16View.h>
|
||||||
#include <LibGC/Heap.h>
|
#include <LibGC/Heap.h>
|
||||||
#include <LibJS/Runtime/AbstractOperations.h>
|
#include <LibJS/Runtime/AbstractOperations.h>
|
||||||
|
@ -121,7 +122,7 @@ CodePoint code_point_at(Utf16View const& string, size_t position)
|
||||||
}
|
}
|
||||||
|
|
||||||
// 6. If first is a trailing surrogate or position + 1 = size, then
|
// 6. If first is a trailing surrogate or position + 1 = size, then
|
||||||
if (Utf16View::is_low_surrogate(first) || (position + 1 == string.length_in_code_units())) {
|
if (AK::UnicodeUtils::is_utf16_low_surrogate(first) || (position + 1 == string.length_in_code_units())) {
|
||||||
// a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
|
// a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
|
||||||
return { true, code_point, 1 };
|
return { true, code_point, 1 };
|
||||||
}
|
}
|
||||||
|
@ -130,13 +131,13 @@ CodePoint code_point_at(Utf16View const& string, size_t position)
|
||||||
auto second = string.code_unit_at(position + 1);
|
auto second = string.code_unit_at(position + 1);
|
||||||
|
|
||||||
// 8. If second is not a trailing surrogate, then
|
// 8. If second is not a trailing surrogate, then
|
||||||
if (!Utf16View::is_low_surrogate(second)) {
|
if (!AK::UnicodeUtils::is_utf16_low_surrogate(second)) {
|
||||||
// a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
|
// a. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 1, [[IsUnpairedSurrogate]]: true }.
|
||||||
return { true, code_point, 1 };
|
return { true, code_point, 1 };
|
||||||
}
|
}
|
||||||
|
|
||||||
// 9. Set cp to UTF16SurrogatePairToCodePoint(first, second).
|
// 9. Set cp to UTF16SurrogatePairToCodePoint(first, second).
|
||||||
code_point = Utf16View::decode_surrogate_pair(first, second);
|
code_point = AK::UnicodeUtils::decode_utf16_surrogate_pair(first, second);
|
||||||
|
|
||||||
// 10. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 2, [[IsUnpairedSurrogate]]: false }.
|
// 10. Return the Record { [[CodePoint]]: cp, [[CodeUnitCount]]: 2, [[IsUnpairedSurrogate]]: false }.
|
||||||
return { false, code_point, 2 };
|
return { false, code_point, 2 };
|
||||||
|
|
|
@ -184,11 +184,11 @@ Optional<u32> TextEncoderStream::convert_code_unit_to_scalar_value(u32 item, Utf
|
||||||
|
|
||||||
// 3. If item is a trailing surrogate, then return a scalar value from surrogates given leadingSurrogate
|
// 3. If item is a trailing surrogate, then return a scalar value from surrogates given leadingSurrogate
|
||||||
// and item.
|
// and item.
|
||||||
if (Utf16View::is_low_surrogate(item)) {
|
if (AK::UnicodeUtils::is_utf16_low_surrogate(item)) {
|
||||||
// https://encoding.spec.whatwg.org/#scalar-value-from-surrogates
|
// https://encoding.spec.whatwg.org/#scalar-value-from-surrogates
|
||||||
// To obtain a scalar value from surrogates, given a leading surrogate leading and a trailing surrogate
|
// To obtain a scalar value from surrogates, given a leading surrogate leading and a trailing surrogate
|
||||||
// trailing, return 0x10000 + ((leading − 0xD800) << 10) + (trailing − 0xDC00).
|
// trailing, return 0x10000 + ((leading − 0xD800) << 10) + (trailing − 0xDC00).
|
||||||
return Utf16View::decode_surrogate_pair(leading_surrogate, item);
|
return AK::UnicodeUtils::decode_utf16_surrogate_pair(leading_surrogate, item);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 4. Restore item to input.
|
// 4. Restore item to input.
|
||||||
|
@ -199,13 +199,13 @@ Optional<u32> TextEncoderStream::convert_code_unit_to_scalar_value(u32 item, Utf
|
||||||
}
|
}
|
||||||
|
|
||||||
// 2. If item is a leading surrogate, then set encoder’s leading surrogate to item and return continue.
|
// 2. If item is a leading surrogate, then set encoder’s leading surrogate to item and return continue.
|
||||||
if (Utf16View::is_high_surrogate(item)) {
|
if (AK::UnicodeUtils::is_utf16_high_surrogate(item)) {
|
||||||
m_leading_surrogate = item;
|
m_leading_surrogate = item;
|
||||||
return OptionalNone {};
|
return OptionalNone {};
|
||||||
}
|
}
|
||||||
|
|
||||||
// 3. If item is a trailing surrogate, then return U+FFFD.
|
// 3. If item is a trailing surrogate, then return U+FFFD.
|
||||||
if (Utf16View::is_low_surrogate(item))
|
if (AK::UnicodeUtils::is_utf16_low_surrogate(item))
|
||||||
return 0xFFFD;
|
return 0xFFFD;
|
||||||
|
|
||||||
// 4. Return item.
|
// 4. Return item.
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue