AK: Allow treating UTF-16 views with lonely surrogates as valid

Much of the web requires us to allow lonely surrogates in UTF-16 data.
The default behavior of disallowing such code units is not changed
here; that will be changed in an upcoming commit.
This commit is contained in:
Timothy Flynn 2025-06-18 11:04:18 -04:00 committed by Tim Flynn
commit 2abc955ca9
Notes: github-actions[bot] 2025-07-03 13:53:41 +00:00
3 changed files with 84 additions and 38 deletions

View file

@ -307,16 +307,31 @@ bool Utf16View::is_code_unit_less_than(Utf16View const& other) const
return a.size() < b.size(); return a.size() < b.size();
} }
// NOTE: each row of this side-by-side diff shows the old line followed by the new line.
// Overload of validate() for callers that do not need the count of valid code units.
// Old side: returned the result of simdutf::validate_utf16 on the whole view.
// New side: forwards to the two-argument overload with a throwaway counter, passing
// allow_invalid_code_units through, so both overloads share one implementation.
bool Utf16View::validate() const bool Utf16View::validate(AllowInvalidCodeUnits allow_invalid_code_units) const
{ {
return simdutf::validate_utf16(char_data(), length_in_code_units()); size_t valid_code_units = 0;
return validate(valid_code_units, allow_invalid_code_units);
} }
// NOTE: each row of this side-by-side diff shows the old line followed by the new line.
// Validates the view and writes the number of accepted code units to valid_code_units.
// Old side: a single simdutf::validate_utf16_with_errors call; the count is result.count and
// the return value is whether result.error is SUCCESS.
// New side: validates the remaining view in a loop, accumulating result.count on each pass.
// Returns true on SUCCESS or once the view is exhausted; returns false on the first error
// unless allow_invalid_code_units is Yes and the error is simdutf::SURROGATE.
bool Utf16View::validate(size_t& valid_code_units) const bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const
{ {
auto result = simdutf::validate_utf16_with_errors(char_data(), length_in_code_units()); auto view = *this;
valid_code_units = result.count; valid_code_units = 0;
return result.error == simdutf::SUCCESS;
while (!view.is_empty()) {
auto result = simdutf::validate_utf16_with_errors(view.char_data(), view.length_in_code_units());
valid_code_units += result.count;
if (result.error == simdutf::SUCCESS)
return true;
// Strict mode fails on any error; lenient mode only tolerates SURROGATE errors.
if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE)
return false;
// Resume after the rejected code unit (result.count accepted units plus the one that failed)
// and count that unit as accepted.
view = view.substring_view(result.count + 1);
++valid_code_units;
}
// Reached when the loop ends with an empty view, e.g. when the last code unit was a tolerated surrogate
// or the view was empty to begin with.
return true;
} }
size_t Utf16View::calculate_length_in_code_points() const size_t Utf16View::calculate_length_in_code_points() const

View file

@ -148,8 +148,8 @@ public:
bool starts_with(Utf16View const&) const; bool starts_with(Utf16View const&) const;
bool is_code_unit_less_than(Utf16View const& other) const; bool is_code_unit_less_than(Utf16View const& other) const;
// validate() overloads (old declaration followed by new declaration on each row).
// The new declarations add an AllowInvalidCodeUnits parameter that defaults to
// AllowInvalidCodeUnits::No, so call sites that pass no such argument still compile.
// The second overload additionally reports a count through valid_code_units.
bool validate() const; bool validate(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
bool validate(size_t& valid_code_units) const; bool validate(size_t& valid_code_units, AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const;
bool equals_ignoring_case(Utf16View const&) const; bool equals_ignoring_case(Utf16View const&) const;

View file

@ -187,55 +187,86 @@ TEST_CASE(iterate_utf16)
// NOTE: each row of this side-by-side diff shows the old line followed by the new line.
// Checks Utf16View::validate(valid_code_units, ...) on inputs containing unpaired surrogates.
// New side, for every input: the default call must fail and report the number of code units
// before the first bad one; the call with AllowInvalidCodeUnits::Yes must succeed and report
// the full length of the input (1, 2 or 3 code units in the cases below).
TEST_CASE(validate_invalid_utf16) TEST_CASE(validate_invalid_utf16)
{ {
size_t valid_code_units = 0; size_t valid_code_units = 0;
Utf16View invalid;
{ {
// Lonely high surrogate. // Lonely high surrogate.
auto invalid = Array { (u16)0xd800 }; invalid = u"\xd800";
EXPECT(!Utf16View(invalid).validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT(valid_code_units == 0); EXPECT_EQ(valid_code_units, 0uz);
invalid = Array { (u16)0xdbff }; EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT(!Utf16View(invalid).validate(valid_code_units)); EXPECT_EQ(valid_code_units, 1uz);
EXPECT(valid_code_units == 0);
invalid = u"\xdbff";
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT_EQ(valid_code_units, 1uz);
} }
{ {
// Lonely low surrogate. // Lonely low surrogate.
auto invalid = Array { (u16)0xdc00 }; invalid = u"\xdc00";
EXPECT(!Utf16View(invalid).validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT(valid_code_units == 0); EXPECT_EQ(valid_code_units, 0uz);
invalid = Array { (u16)0xdfff }; EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT(!Utf16View(invalid).validate(valid_code_units)); EXPECT_EQ(valid_code_units, 1uz);
EXPECT(valid_code_units == 0);
invalid = u"\xdfff";
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT_EQ(valid_code_units, 1uz);
} }
{ {
// High surrogate followed by non-surrogate. // High surrogate followed by non-surrogate.
auto invalid = Array { (u16)0xd800, 0 }; invalid = u"\xd800\x0000";
EXPECT(!Utf16View(invalid).validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT(valid_code_units == 0); EXPECT_EQ(valid_code_units, 0uz);
invalid = Array { (u16)0xd800, 0xe000 }; EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT(!Utf16View(invalid).validate(valid_code_units)); EXPECT_EQ(valid_code_units, 2uz);
EXPECT(valid_code_units == 0);
invalid = u"\xd800\xe000";
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT_EQ(valid_code_units, 2uz);
} }
{ {
// High surrogate followed by high surrogate. // High surrogate followed by high surrogate.
auto invalid = Array { (u16)0xd800, 0xd800 }; invalid = u"\xd800\xd800";
EXPECT(!Utf16View(invalid).validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT(valid_code_units == 0); EXPECT_EQ(valid_code_units, 0uz);
invalid = Array { (u16)0xd800, 0xdbff }; EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT(!Utf16View(invalid).validate(valid_code_units)); EXPECT_EQ(valid_code_units, 2uz);
EXPECT(valid_code_units == 0);
invalid = u"\xd800\xdbff";
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 0uz);
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT_EQ(valid_code_units, 2uz);
} }
{ {
// Valid UTF-16 followed by invalid code units. // Valid UTF-16 followed by invalid code units.
auto invalid = Array { (u16)0x41, 0x41, 0xd800 }; invalid = u"\x0041\x0041\xd800";
EXPECT(!Utf16View(invalid).validate(valid_code_units)); EXPECT(!invalid.validate(valid_code_units));
EXPECT(valid_code_units == 2); EXPECT_EQ(valid_code_units, 2uz);
invalid = Array { (u16)0x41, 0x41, 0xd800 }; EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT(!Utf16View(invalid).validate(valid_code_units)); EXPECT_EQ(valid_code_units, 3uz);
EXPECT(valid_code_units == 2);
// NOTE(review): this input is identical to the first one in this section (the old side
// repeated its input as well), so the checks below add no new coverage; a different
// trailing surrogate may have been intended - confirm.
invalid = u"\x0041\x0041\xd800";
EXPECT(!invalid.validate(valid_code_units));
EXPECT_EQ(valid_code_units, 2uz);
EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes));
EXPECT_EQ(valid_code_units, 3uz);
} }
} }