mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-07-29 12:19:54 +00:00
AK+Everywhere: Allow lonely UTF-16 surrogates by default
By definition, the web allows lonely surrogates by default. Let's have our string APIs reflect this, so we don't have to pass an allow option all over the place.
This commit is contained in:
parent
86b1c78c1a
commit
9fc3e72db2
Notes:
github-actions[bot]
2025-07-03 13:53:17 +00:00
Author: https://github.com/trflynn89
Commit: 9fc3e72db2
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5228
Reviewed-by: https://github.com/ADKaster ✅
Reviewed-by: https://github.com/shannonbooth
14 changed files with 74 additions and 77 deletions
|
@ -85,6 +85,11 @@ constexpr inline u16 LOW_SURROGATE_MAX = 0xdfff;
|
||||||
constexpr inline u32 REPLACEMENT_CODE_POINT = 0xfffd;
|
constexpr inline u32 REPLACEMENT_CODE_POINT = 0xfffd;
|
||||||
constexpr inline u32 FIRST_SUPPLEMENTARY_PLANE_CODE_POINT = 0x10000;
|
constexpr inline u32 FIRST_SUPPLEMENTARY_PLANE_CODE_POINT = 0x10000;
|
||||||
|
|
||||||
|
enum class AllowLonelySurrogates {
|
||||||
|
No,
|
||||||
|
Yes,
|
||||||
|
};
|
||||||
|
|
||||||
[[nodiscard]] constexpr size_t code_unit_length_for_code_point(u32 code_point)
|
[[nodiscard]] constexpr size_t code_unit_length_for_code_point(u32 code_point)
|
||||||
{
|
{
|
||||||
return code_point < FIRST_SUPPLEMENTARY_PLANE_CODE_POINT ? 1uz : 2uz;
|
return code_point < FIRST_SUPPLEMENTARY_PLANE_CODE_POINT ? 1uz : 2uz;
|
||||||
|
@ -201,3 +206,7 @@ constexpr ErrorOr<size_t> try_code_point_to_utf16(u32 code_point, Callback callb
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#if USING_AK_GLOBALLY
|
||||||
|
using AK::UnicodeUtils::AllowLonelySurrogates;
|
||||||
|
#endif
|
||||||
|
|
|
@ -46,7 +46,7 @@ ErrorOr<Utf16ConversionResult> utf8_to_utf16(Utf8View const& utf8_view)
|
||||||
return Utf16ConversionResult { Utf16Data {}, 0 };
|
return Utf16ConversionResult { Utf16Data {}, 0 };
|
||||||
|
|
||||||
// All callers want to allow lonely surrogates, which simdutf does not permit.
|
// All callers want to allow lonely surrogates, which simdutf does not permit.
|
||||||
if (!utf8_view.validate(Utf8View::AllowSurrogates::No)) [[unlikely]]
|
if (!utf8_view.validate(AllowLonelySurrogates::No)) [[unlikely]]
|
||||||
return to_utf16_slow(utf8_view);
|
return to_utf16_slow(utf8_view);
|
||||||
|
|
||||||
auto const* data = reinterpret_cast<char const*>(utf8_view.bytes());
|
auto const* data = reinterpret_cast<char const*>(utf8_view.bytes());
|
||||||
|
@ -95,14 +95,14 @@ size_t utf16_code_unit_length_from_utf8(StringView string)
|
||||||
return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length());
|
return simdutf::utf16_length_from_utf8(string.characters_without_null_termination(), string.length());
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_units) const
|
ErrorOr<String> Utf16View::to_utf8(AllowLonelySurrogates allow_lonely_surrogates) const
|
||||||
{
|
{
|
||||||
if (is_empty())
|
if (is_empty())
|
||||||
return String {};
|
return String {};
|
||||||
if (!validate(allow_invalid_code_units))
|
if (!validate(allow_lonely_surrogates))
|
||||||
return Error::from_string_literal("Input was not valid UTF-16");
|
return Error::from_string_literal("Input was not valid UTF-16");
|
||||||
|
|
||||||
if (allow_invalid_code_units == AllowInvalidCodeUnits::No) {
|
if (allow_lonely_surrogates == AllowLonelySurrogates::No) {
|
||||||
String result;
|
String result;
|
||||||
auto utf8_length = simdutf::utf8_length_from_utf16(m_string, length_in_code_units());
|
auto utf8_length = simdutf::utf8_length_from_utf16(m_string, length_in_code_units());
|
||||||
|
|
||||||
|
@ -120,9 +120,9 @@ ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_unit
|
||||||
return builder.to_string();
|
return builder.to_string();
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<ByteString> Utf16View::to_byte_string(AllowInvalidCodeUnits allow_invalid_code_units) const
|
ErrorOr<ByteString> Utf16View::to_byte_string(AllowLonelySurrogates allow_lonely_surrogates) const
|
||||||
{
|
{
|
||||||
return TRY(to_utf8(allow_invalid_code_units)).to_byte_string();
|
return TRY(to_utf8(allow_lonely_surrogates)).to_byte_string();
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Utf16View::is_ascii() const
|
bool Utf16View::is_ascii() const
|
||||||
|
@ -130,7 +130,7 @@ bool Utf16View::is_ascii() const
|
||||||
return simdutf::validate_ascii(reinterpret_cast<char const*>(m_string), length_in_code_units() * sizeof(char16_t));
|
return simdutf::validate_ascii(reinterpret_cast<char const*>(m_string), length_in_code_units() * sizeof(char16_t));
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const
|
bool Utf16View::validate(size_t& valid_code_units, AllowLonelySurrogates allow_lonely_surrogates) const
|
||||||
{
|
{
|
||||||
auto view = *this;
|
auto view = *this;
|
||||||
valid_code_units = 0;
|
valid_code_units = 0;
|
||||||
|
@ -141,7 +141,7 @@ bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_i
|
||||||
|
|
||||||
if (result.error == simdutf::SUCCESS)
|
if (result.error == simdutf::SUCCESS)
|
||||||
return true;
|
return true;
|
||||||
if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE)
|
if (allow_lonely_surrogates == AllowLonelySurrogates::No || result.error != simdutf::SURROGATE)
|
||||||
return false;
|
return false;
|
||||||
|
|
||||||
view = view.substring_view(result.count + 1);
|
view = view.substring_view(result.count + 1);
|
||||||
|
@ -219,10 +219,8 @@ Utf16View Utf16View::unicode_substring_view(size_t code_point_offset, size_t cod
|
||||||
|
|
||||||
size_t Utf16View::calculate_length_in_code_points() const
|
size_t Utf16View::calculate_length_in_code_points() const
|
||||||
{
|
{
|
||||||
// FIXME: simdutf's code point length method assumes valid UTF-16, whereas Utf16View uses U+FFFD as a replacement
|
// simdutf's code point length method assumes valid UTF-16, whereas we allow lonely surrogates.
|
||||||
// for invalid code points. If we change Utf16View to only accept valid encodings as an invariant, we can
|
if (validate(AllowLonelySurrogates::No)) [[likely]]
|
||||||
// remove this branch.
|
|
||||||
if (validate()) [[likely]]
|
|
||||||
return simdutf::count_utf16(m_string, length_in_code_units());
|
return simdutf::count_utf16(m_string, length_in_code_units());
|
||||||
|
|
||||||
size_t code_points = 0;
|
size_t code_points = 0;
|
||||||
|
|
|
@ -102,11 +102,6 @@ class Utf16View {
|
||||||
public:
|
public:
|
||||||
using Iterator = Utf16CodePointIterator;
|
using Iterator = Utf16CodePointIterator;
|
||||||
|
|
||||||
enum class AllowInvalidCodeUnits {
|
|
||||||
No,
|
|
||||||
Yes,
|
|
||||||
};
|
|
||||||
|
|
||||||
Utf16View() = default;
|
Utf16View() = default;
|
||||||
~Utf16View() = default;
|
~Utf16View() = default;
|
||||||
|
|
||||||
|
@ -130,8 +125,8 @@ public:
|
||||||
{
|
{
|
||||||
}
|
}
|
||||||
|
|
||||||
ErrorOr<String> to_utf8(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
|
ErrorOr<String> to_utf8(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
|
||||||
ErrorOr<ByteString> to_byte_string(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
|
ErrorOr<ByteString> to_byte_string(AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
|
||||||
|
|
||||||
[[nodiscard]] constexpr ReadonlySpan<char16_t> span() const
|
[[nodiscard]] constexpr ReadonlySpan<char16_t> span() const
|
||||||
{
|
{
|
||||||
|
@ -187,13 +182,13 @@ public:
|
||||||
[[nodiscard]] constexpr bool is_empty() const { return length_in_code_units() == 0; }
|
[[nodiscard]] constexpr bool is_empty() const { return length_in_code_units() == 0; }
|
||||||
[[nodiscard]] bool is_ascii() const;
|
[[nodiscard]] bool is_ascii() const;
|
||||||
|
|
||||||
[[nodiscard]] ALWAYS_INLINE bool validate(AllowInvalidCodeUnits allow_invalid_code_units = AllowInvalidCodeUnits::No) const
|
[[nodiscard]] ALWAYS_INLINE bool validate(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const
|
||||||
{
|
{
|
||||||
size_t valid_code_units = 0;
|
size_t valid_code_units = 0;
|
||||||
return validate(valid_code_units, allow_invalid_code_units);
|
return validate(valid_code_units, allow_lonely_surrogates);
|
||||||
}
|
}
|
||||||
|
|
||||||
[[nodiscard]] bool validate(size_t& valid_code_units, AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
|
[[nodiscard]] bool validate(size_t& valid_code_units, AllowLonelySurrogates = AllowLonelySurrogates::Yes) const;
|
||||||
|
|
||||||
[[nodiscard]] constexpr size_t length_in_code_units() const { return m_length_in_code_units; }
|
[[nodiscard]] constexpr size_t length_in_code_units() const { return m_length_in_code_units; }
|
||||||
|
|
||||||
|
|
|
@ -185,16 +185,16 @@ Utf8View Utf8View::trim(Utf8View const& characters, TrimMode mode) const
|
||||||
return substring_view(substring_start, substring_length);
|
return substring_view(substring_start, substring_length);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Utf8View::validate(size_t& valid_bytes, AllowSurrogates allow_surrogates) const
|
bool Utf8View::validate(size_t& valid_bytes, AllowLonelySurrogates allow_lonely_surrogates) const
|
||||||
{
|
{
|
||||||
auto result = simdutf::validate_utf8_with_errors(m_string.characters_without_null_termination(), m_string.length());
|
auto result = simdutf::validate_utf8_with_errors(m_string.characters_without_null_termination(), m_string.length());
|
||||||
valid_bytes = result.count;
|
valid_bytes = result.count;
|
||||||
|
|
||||||
if (result.error == simdutf::SURROGATE && allow_surrogates == AllowSurrogates::Yes) {
|
if (result.error == simdutf::SURROGATE && allow_lonely_surrogates == AllowLonelySurrogates::Yes) {
|
||||||
valid_bytes += 3; // All surrogates have a UTF-8 byte length of 3.
|
valid_bytes += 3; // All surrogates have a UTF-8 byte length of 3.
|
||||||
|
|
||||||
size_t substring_valid_bytes = 0;
|
size_t substring_valid_bytes = 0;
|
||||||
auto is_valid = substring_view(valid_bytes).validate(substring_valid_bytes, allow_surrogates);
|
auto is_valid = substring_view(valid_bytes).validate(substring_valid_bytes, allow_lonely_surrogates);
|
||||||
|
|
||||||
valid_bytes += substring_valid_bytes;
|
valid_bytes += substring_valid_bytes;
|
||||||
return is_valid;
|
return is_valid;
|
||||||
|
|
|
@ -13,6 +13,7 @@
|
||||||
#include <AK/Function.h>
|
#include <AK/Function.h>
|
||||||
#include <AK/StringView.h>
|
#include <AK/StringView.h>
|
||||||
#include <AK/Types.h>
|
#include <AK/Types.h>
|
||||||
|
#include <AK/UnicodeUtils.h>
|
||||||
|
|
||||||
namespace AK {
|
namespace AK {
|
||||||
|
|
||||||
|
@ -77,12 +78,6 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
explicit Utf8View(ByteString&&) = delete;
|
explicit Utf8View(ByteString&&) = delete;
|
||||||
|
|
||||||
enum class AllowSurrogates {
|
|
||||||
Yes,
|
|
||||||
No,
|
|
||||||
};
|
|
||||||
|
|
||||||
~Utf8View() = default;
|
~Utf8View() = default;
|
||||||
|
|
||||||
StringView as_string() const { return m_string; }
|
StringView as_string() const { return m_string; }
|
||||||
|
@ -135,13 +130,13 @@ public:
|
||||||
return m_length;
|
return m_length;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool validate(AllowSurrogates allow_surrogates = AllowSurrogates::Yes) const
|
bool validate(AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const
|
||||||
{
|
{
|
||||||
size_t valid_bytes = 0;
|
size_t valid_bytes = 0;
|
||||||
return validate(valid_bytes, allow_surrogates);
|
return validate(valid_bytes, allow_lonely_surrogates);
|
||||||
}
|
}
|
||||||
|
|
||||||
bool validate(size_t& valid_bytes, AllowSurrogates allow_surrogates = AllowSurrogates::Yes) const;
|
bool validate(size_t& valid_bytes, AllowLonelySurrogates allow_lonely_surrogates = AllowLonelySurrogates::Yes) const;
|
||||||
|
|
||||||
template<typename Callback>
|
template<typename Callback>
|
||||||
auto for_each_split_view(Function<bool(u32)> splitter, SplitBehavior split_behavior, Callback callback) const
|
auto for_each_split_view(Function<bool(u32)> splitter, SplitBehavior split_behavior, Callback callback) const
|
||||||
|
|
|
@ -1393,7 +1393,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
|
||||||
|
|
||||||
// 2. Let groupName be the substring of templateRemainder from 2 to gtPos.
|
// 2. Let groupName be the substring of templateRemainder from 2 to gtPos.
|
||||||
auto group_name_view = template_remainder.substring_view(2, *greater_than_position - 2);
|
auto group_name_view = template_remainder.substring_view(2, *greater_than_position - 2);
|
||||||
auto group_name = MUST(group_name_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
|
auto group_name = MUST(group_name_view.to_utf8());
|
||||||
|
|
||||||
// 3. Assert: namedCaptures is an Object.
|
// 3. Assert: namedCaptures is an Object.
|
||||||
VERIFY(named_captures.is_object());
|
VERIFY(named_captures.is_object());
|
||||||
|
@ -1435,7 +1435,7 @@ ThrowCompletionOr<String> get_substitution(VM& vm, Utf16View const& matched, Utf
|
||||||
}
|
}
|
||||||
|
|
||||||
// 6. Return result.
|
// 6. Return result.
|
||||||
return MUST(Utf16View { result }.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
|
return MUST(Utf16View { result }.to_utf8());
|
||||||
}
|
}
|
||||||
|
|
||||||
void DisposeCapability::visit_edges(GC::Cell::Visitor& visitor) const
|
void DisposeCapability::visit_edges(GC::Cell::Visitor& visitor) const
|
||||||
|
|
|
@ -99,7 +99,7 @@ Optional<size_t> string_index_of(Utf16View const& string, Utf16View const& searc
|
||||||
static bool is_string_well_formed_unicode(Utf16View string)
|
static bool is_string_well_formed_unicode(Utf16View string)
|
||||||
{
|
{
|
||||||
// OPTIMIZATION: simdutf can do this much faster.
|
// OPTIMIZATION: simdutf can do this much faster.
|
||||||
return string.validate();
|
return string.validate(AllowLonelySurrogates::No);
|
||||||
}
|
}
|
||||||
|
|
||||||
// 11.1.4 CodePointAt ( string, position ), https://tc39.es/ecma262/#sec-codepointat
|
// 11.1.4 CodePointAt ( string, position ), https://tc39.es/ecma262/#sec-codepointat
|
||||||
|
|
|
@ -125,12 +125,12 @@ Utf16View Utf16String::substring_view(size_t code_unit_offset) const
|
||||||
|
|
||||||
String Utf16String::to_utf8() const
|
String Utf16String::to_utf8() const
|
||||||
{
|
{
|
||||||
return MUST(view().to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
|
return MUST(view().to_utf8());
|
||||||
}
|
}
|
||||||
|
|
||||||
ByteString Utf16String::to_byte_string() const
|
ByteString Utf16String::to_byte_string() const
|
||||||
{
|
{
|
||||||
return MUST(view().to_byte_string(Utf16View::AllowInvalidCodeUnits::Yes));
|
return MUST(view().to_byte_string());
|
||||||
}
|
}
|
||||||
|
|
||||||
u16 Utf16String::code_unit_at(size_t index) const
|
u16 Utf16String::code_unit_at(size_t index) const
|
||||||
|
|
|
@ -181,7 +181,7 @@ public:
|
||||||
{
|
{
|
||||||
return m_view.visit(
|
return m_view.visit(
|
||||||
[](StringView view) { return view.to_byte_string(); },
|
[](StringView view) { return view.to_byte_string(); },
|
||||||
[](Utf16View view) { return view.to_byte_string(Utf16View::AllowInvalidCodeUnits::Yes).release_value_but_fixme_should_propagate_errors(); },
|
[](Utf16View view) { return view.to_byte_string().release_value_but_fixme_should_propagate_errors(); },
|
||||||
[](auto& view) {
|
[](auto& view) {
|
||||||
StringBuilder builder;
|
StringBuilder builder;
|
||||||
for (auto it = view.begin(); it != view.end(); ++it)
|
for (auto it = view.begin(); it != view.end(); ++it)
|
||||||
|
@ -194,7 +194,7 @@ public:
|
||||||
{
|
{
|
||||||
return m_view.visit(
|
return m_view.visit(
|
||||||
[](StringView view) { return String::from_utf8(view); },
|
[](StringView view) { return String::from_utf8(view); },
|
||||||
[](Utf16View view) { return view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes); },
|
[](Utf16View view) { return view.to_utf8(); },
|
||||||
[](auto& view) -> ErrorOr<String> {
|
[](auto& view) -> ErrorOr<String> {
|
||||||
StringBuilder builder;
|
StringBuilder builder;
|
||||||
for (auto it = view.begin(); it != view.end(); ++it)
|
for (auto it = view.begin(); it != view.end(); ++it)
|
||||||
|
|
|
@ -93,7 +93,7 @@ static ParseResult<ByteString> parse_name(Stream& stream)
|
||||||
ScopeLogger<WASM_BINPARSER_DEBUG> logger;
|
ScopeLogger<WASM_BINPARSER_DEBUG> logger;
|
||||||
auto data = TRY(parse_vector<u8>(stream));
|
auto data = TRY(parse_vector<u8>(stream));
|
||||||
auto string = ByteString::copy(data);
|
auto string = ByteString::copy(data);
|
||||||
if (!Utf8View(string).validate(Utf8View::AllowSurrogates::No))
|
if (!Utf8View(string).validate(AllowLonelySurrogates::No))
|
||||||
return ParseError::InvalidUtf8;
|
return ParseError::InvalidUtf8;
|
||||||
return string;
|
return string;
|
||||||
}
|
}
|
||||||
|
|
|
@ -57,10 +57,10 @@ WebIDL::ExceptionOr<String> CharacterData::substring_data(size_t offset, size_t
|
||||||
// 3. If offset plus count is greater than length, return a string whose value is the code units from the offsetth code unit
|
// 3. If offset plus count is greater than length, return a string whose value is the code units from the offsetth code unit
|
||||||
// to the end of node’s data, and then return.
|
// to the end of node’s data, and then return.
|
||||||
if (offset + count > length)
|
if (offset + count > length)
|
||||||
return MUST(utf16_view.substring_view(offset).to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
|
return MUST(utf16_view.substring_view(offset).to_utf8());
|
||||||
|
|
||||||
// 4. Return a string whose value is the code units from the offsetth code unit to the offset+countth code unit in node’s data.
|
// 4. Return a string whose value is the code units from the offsetth code unit to the offset+countth code unit in node’s data.
|
||||||
return MUST(utf16_view.substring_view(offset, count).to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
|
return MUST(utf16_view.substring_view(offset, count).to_utf8());
|
||||||
}
|
}
|
||||||
|
|
||||||
// https://dom.spec.whatwg.org/#concept-cd-replace
|
// https://dom.spec.whatwg.org/#concept-cd-replace
|
||||||
|
@ -99,7 +99,7 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
|
||||||
|
|
||||||
// OPTIMIZATION: Skip UTF-8 encoding if the characters are the same.
|
// OPTIMIZATION: Skip UTF-8 encoding if the characters are the same.
|
||||||
if (!characters_are_the_same) {
|
if (!characters_are_the_same) {
|
||||||
m_data = MUST(full_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
|
m_data = MUST(full_view.to_utf8());
|
||||||
}
|
}
|
||||||
|
|
||||||
// 4. Queue a mutation record of "characterData" for node with null, null, node’s data, « », « », null, and null.
|
// 4. Queue a mutation record of "characterData" for node with null, null, node’s data, « », « », null, and null.
|
||||||
|
|
|
@ -110,7 +110,7 @@ WebIDL::ExceptionOr<FileReader::Result> FileReader::blob_package_data(JS::Realm&
|
||||||
builder.ensure_capacity(bytes.size());
|
builder.ensure_capacity(bytes.size());
|
||||||
for (auto byte : bytes.bytes())
|
for (auto byte : bytes.bytes())
|
||||||
builder.unchecked_append(byte);
|
builder.unchecked_append(byte);
|
||||||
return MUST(Utf16View { builder }.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
|
return MUST(Utf16View { builder }.to_utf8());
|
||||||
}
|
}
|
||||||
VERIFY_NOT_REACHED();
|
VERIFY_NOT_REACHED();
|
||||||
}
|
}
|
||||||
|
|
|
@ -56,13 +56,13 @@ TEST_CASE(encode_utf8)
|
||||||
auto utf8_string = "Привет, мир! 😀 γειά σου κόσμος こんにちは世界"_string;
|
auto utf8_string = "Привет, мир! 😀 γειά σου κόσμος こんにちは世界"_string;
|
||||||
auto string = MUST(AK::utf8_to_utf16(utf8_string));
|
auto string = MUST(AK::utf8_to_utf16(utf8_string));
|
||||||
Utf16View view { string };
|
Utf16View view { string };
|
||||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), utf8_string);
|
EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::Yes)), utf8_string);
|
||||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), utf8_string);
|
EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::No)), utf8_string);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
Utf16View view { u"\xd83d"sv };
|
Utf16View view { u"\xd83d"sv };
|
||||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv);
|
EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::Yes)), "\xed\xa0\xbd"sv);
|
||||||
EXPECT(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No).is_error());
|
EXPECT(view.to_utf8(AllowLonelySurrogates::No).is_error());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -99,8 +99,8 @@ TEST_CASE(null_view)
|
||||||
EXPECT(view.validate());
|
EXPECT(view.validate());
|
||||||
EXPECT_EQ(view.length_in_code_units(), 0zu);
|
EXPECT_EQ(view.length_in_code_units(), 0zu);
|
||||||
EXPECT_EQ(view.length_in_code_points(), 0zu);
|
EXPECT_EQ(view.length_in_code_points(), 0zu);
|
||||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No)), ""sv);
|
EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::No)), ""sv);
|
||||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), ""sv);
|
EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::Yes)), ""sv);
|
||||||
|
|
||||||
for ([[maybe_unused]] auto it : view)
|
for ([[maybe_unused]] auto it : view)
|
||||||
FAIL("Iterating a null UTF-16 string should not produce any values");
|
FAIL("Iterating a null UTF-16 string should not produce any values");
|
||||||
|
@ -187,81 +187,81 @@ TEST_CASE(validate_invalid_utf16)
|
||||||
{
|
{
|
||||||
// Lonely high surrogate.
|
// Lonely high surrogate.
|
||||||
invalid = u"\xd800"sv;
|
invalid = u"\xd800"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 1uz);
|
EXPECT_EQ(valid_code_units, 1uz);
|
||||||
|
|
||||||
invalid = u"\xdbff"sv;
|
invalid = u"\xdbff"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 1uz);
|
EXPECT_EQ(valid_code_units, 1uz);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// Lonely low surrogate.
|
// Lonely low surrogate.
|
||||||
invalid = u"\xdc00"sv;
|
invalid = u"\xdc00"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 1uz);
|
EXPECT_EQ(valid_code_units, 1uz);
|
||||||
|
|
||||||
invalid = u"\xdfff"sv;
|
invalid = u"\xdfff"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 1uz);
|
EXPECT_EQ(valid_code_units, 1uz);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// High surrogate followed by non-surrogate.
|
// High surrogate followed by non-surrogate.
|
||||||
invalid = u"\xd800\x0000"sv;
|
invalid = u"\xd800\x0000"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 2uz);
|
EXPECT_EQ(valid_code_units, 2uz);
|
||||||
|
|
||||||
invalid = u"\xd800\xe000"sv;
|
invalid = u"\xd800\xe000"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 2uz);
|
EXPECT_EQ(valid_code_units, 2uz);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// High surrogate followed by high surrogate.
|
// High surrogate followed by high surrogate.
|
||||||
invalid = u"\xd800\xd800"sv;
|
invalid = u"\xd800\xd800"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 2uz);
|
EXPECT_EQ(valid_code_units, 2uz);
|
||||||
|
|
||||||
invalid = u"\xd800\xdbff"sv;
|
invalid = u"\xd800\xdbff"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
|
||||||
EXPECT_EQ(valid_code_units, 0uz);
|
EXPECT_EQ(valid_code_units, 0uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 2uz);
|
EXPECT_EQ(valid_code_units, 2uz);
|
||||||
}
|
}
|
||||||
{
|
{
|
||||||
// Valid UTF-16 followed by invalid code units.
|
// Valid UTF-16 followed by invalid code units.
|
||||||
invalid = u"\x0041\x0041\xd800"sv;
|
invalid = u"\x0041\x0041\xd800"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
|
||||||
EXPECT_EQ(valid_code_units, 2uz);
|
EXPECT_EQ(valid_code_units, 2uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 3uz);
|
EXPECT_EQ(valid_code_units, 3uz);
|
||||||
|
|
||||||
invalid = u"\x0041\x0041\xd800"sv;
|
invalid = u"\x0041\x0041\xd800"sv;
|
||||||
EXPECT(!invalid.validate(valid_code_units));
|
EXPECT(!invalid.validate(valid_code_units, AllowLonelySurrogates::No));
|
||||||
EXPECT_EQ(valid_code_units, 2uz);
|
EXPECT_EQ(valid_code_units, 2uz);
|
||||||
|
|
||||||
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
|
EXPECT(invalid.validate(valid_code_units, AllowLonelySurrogates::Yes));
|
||||||
EXPECT_EQ(valid_code_units, 3uz);
|
EXPECT_EQ(valid_code_units, 3uz);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -368,8 +368,8 @@ TEST_CASE(substring_view)
|
||||||
view = view.substring_view(7, 1);
|
view = view.substring_view(7, 1);
|
||||||
|
|
||||||
EXPECT(view.length_in_code_units() == 1);
|
EXPECT(view.length_in_code_units() == 1);
|
||||||
EXPECT_EQ(MUST(view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)), "\xed\xa0\xbd"sv);
|
EXPECT_EQ(MUST(view.to_utf8(AllowLonelySurrogates::Yes)), "\xed\xa0\xbd"sv);
|
||||||
EXPECT(view.to_utf8(Utf16View::AllowInvalidCodeUnits::No).is_error());
|
EXPECT(view.to_utf8(AllowLonelySurrogates::No).is_error());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -51,8 +51,8 @@ TEST_CASE(decode_utf8)
|
||||||
TEST_CASE(null_view)
|
TEST_CASE(null_view)
|
||||||
{
|
{
|
||||||
Utf8View view;
|
Utf8View view;
|
||||||
EXPECT(view.validate(Utf8View::AllowSurrogates::No));
|
EXPECT(view.validate(AllowLonelySurrogates::No));
|
||||||
EXPECT(view.validate(Utf8View::AllowSurrogates::Yes));
|
EXPECT(view.validate(AllowLonelySurrogates::Yes));
|
||||||
EXPECT_EQ(view.byte_length(), 0zu);
|
EXPECT_EQ(view.byte_length(), 0zu);
|
||||||
EXPECT_EQ(view.length(), 0zu);
|
EXPECT_EQ(view.length(), 0zu);
|
||||||
|
|
||||||
|
@ -95,7 +95,7 @@ TEST_CASE(validate_invalid_ut8)
|
||||||
|
|
||||||
char invalid_utf8_7[] = { (char)0xed, (char)0xa0, (char)0x80 }; // U+d800
|
char invalid_utf8_7[] = { (char)0xed, (char)0xa0, (char)0x80 }; // U+d800
|
||||||
Utf8View utf8_7 { StringView { invalid_utf8_7, 3 } };
|
Utf8View utf8_7 { StringView { invalid_utf8_7, 3 } };
|
||||||
EXPECT(!utf8_7.validate(valid_bytes, Utf8View::AllowSurrogates::No));
|
EXPECT(!utf8_7.validate(valid_bytes, AllowLonelySurrogates::No));
|
||||||
EXPECT(valid_bytes == 0);
|
EXPECT(valid_bytes == 0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue