From 2abc955ca96e7e575ac5323c09f15771a60cfada Mon Sep 17 00:00:00 2001 From: Timothy Flynn Date: Wed, 18 Jun 2025 11:04:18 -0400 Subject: [PATCH] AK: Allow treating UTF-16 views with lonely surrogates as valid Much of the web requires us to allow lonely surrogates in UTF-16 data. The default behavior to disallow such code units has not been changed here - that will be changed in an upcoming commit. --- AK/Utf16View.cpp | 27 ++++++++--- AK/Utf16View.h | 4 +- Tests/AK/TestUtf16View.cpp | 91 +++++++++++++++++++++++++------------- 3 files changed, 84 insertions(+), 38 deletions(-) diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp index c4ce147a5b7..e7b09f7140a 100644 --- a/AK/Utf16View.cpp +++ b/AK/Utf16View.cpp @@ -307,16 +307,31 @@ bool Utf16View::is_code_unit_less_than(Utf16View const& other) const return a.size() < b.size(); } -bool Utf16View::validate() const +bool Utf16View::validate(AllowInvalidCodeUnits allow_invalid_code_units) const { - return simdutf::validate_utf16(char_data(), length_in_code_units()); + size_t valid_code_units = 0; + return validate(valid_code_units, allow_invalid_code_units); } -bool Utf16View::validate(size_t& valid_code_units) const +bool Utf16View::validate(size_t& valid_code_units, AllowInvalidCodeUnits allow_invalid_code_units) const { - auto result = simdutf::validate_utf16_with_errors(char_data(), length_in_code_units()); - valid_code_units = result.count; - return result.error == simdutf::SUCCESS; + auto view = *this; + valid_code_units = 0; + + while (!view.is_empty()) { + auto result = simdutf::validate_utf16_with_errors(view.char_data(), view.length_in_code_units()); + valid_code_units += result.count; + + if (result.error == simdutf::SUCCESS) + return true; + if (allow_invalid_code_units == AllowInvalidCodeUnits::No || result.error != simdutf::SURROGATE) + return false; + + view = view.substring_view(result.count + 1); + ++valid_code_units; + } + + return true; } size_t Utf16View::calculate_length_in_code_points() const diff --git a/AK/Utf16View.h b/AK/Utf16View.h index 5a735fd5b9c..986ca69d46a 100644 --- a/AK/Utf16View.h +++ b/AK/Utf16View.h @@ -148,8 +148,8 @@ public: bool starts_with(Utf16View const&) const; bool is_code_unit_less_than(Utf16View const& other) const; - bool validate() const; - bool validate(size_t& valid_code_units) const; + bool validate(AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; + bool validate(size_t& valid_code_units, AllowInvalidCodeUnits = AllowInvalidCodeUnits::No) const; bool equals_ignoring_case(Utf16View const&) const; diff --git a/Tests/AK/TestUtf16View.cpp b/Tests/AK/TestUtf16View.cpp index 852fbc42724..90247c876bf 100644 --- a/Tests/AK/TestUtf16View.cpp +++ b/Tests/AK/TestUtf16View.cpp @@ -187,55 +187,86 @@ TEST_CASE(iterate_utf16) TEST_CASE(validate_invalid_utf16) { size_t valid_code_units = 0; + Utf16View invalid; { // Lonely high surrogate. - auto invalid = Array { (u16)0xd800 }; - EXPECT(!Utf16View(invalid).validate(valid_code_units)); - EXPECT(valid_code_units == 0); + invalid = u"\xd800"; + EXPECT(!invalid.validate(valid_code_units)); + EXPECT_EQ(valid_code_units, 0uz); - invalid = Array { (u16)0xdbff }; - EXPECT(!Utf16View(invalid).validate(valid_code_units)); - EXPECT(valid_code_units == 0); + EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); + EXPECT_EQ(valid_code_units, 1uz); + + invalid = u"\xdbff"; + EXPECT(!invalid.validate(valid_code_units)); + EXPECT_EQ(valid_code_units, 0uz); + + EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); + EXPECT_EQ(valid_code_units, 1uz); } { // Lonely low surrogate. - auto invalid = Array { (u16)0xdc00 }; - EXPECT(!Utf16View(invalid).validate(valid_code_units)); - EXPECT(valid_code_units == 0); + invalid = u"\xdc00"; + EXPECT(!invalid.validate(valid_code_units)); + EXPECT_EQ(valid_code_units, 0uz); - invalid = Array { (u16)0xdfff }; - EXPECT(!Utf16View(invalid).validate(valid_code_units)); - EXPECT(valid_code_units == 0); + EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); + EXPECT_EQ(valid_code_units, 1uz); + + invalid = u"\xdfff"; + EXPECT(!invalid.validate(valid_code_units)); + EXPECT_EQ(valid_code_units, 0uz); + + EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); + EXPECT_EQ(valid_code_units, 1uz); } { // High surrogate followed by non-surrogate. - auto invalid = Array { (u16)0xd800, 0 }; - EXPECT(!Utf16View(invalid).validate(valid_code_units)); - EXPECT(valid_code_units == 0); + invalid = u"\xd800\x0000"; + EXPECT(!invalid.validate(valid_code_units)); + EXPECT_EQ(valid_code_units, 0uz); - invalid = Array { (u16)0xd800, 0xe000 }; - EXPECT(!Utf16View(invalid).validate(valid_code_units)); - EXPECT(valid_code_units == 0); + EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); + EXPECT_EQ(valid_code_units, 2uz); + + invalid = u"\xd800\xe000"; + EXPECT(!invalid.validate(valid_code_units)); + EXPECT_EQ(valid_code_units, 0uz); + + EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); + EXPECT_EQ(valid_code_units, 2uz); } { // High surrogate followed by high surrogate. - auto invalid = Array { (u16)0xd800, 0xd800 }; - EXPECT(!Utf16View(invalid).validate(valid_code_units)); - EXPECT(valid_code_units == 0); + invalid = u"\xd800\xd800"; + EXPECT(!invalid.validate(valid_code_units)); + EXPECT_EQ(valid_code_units, 0uz); - invalid = Array { (u16)0xd800, 0xdbff }; - EXPECT(!Utf16View(invalid).validate(valid_code_units)); - EXPECT(valid_code_units == 0); + EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); + EXPECT_EQ(valid_code_units, 2uz); + + invalid = u"\xd800\xdbff"; + EXPECT(!invalid.validate(valid_code_units)); + EXPECT_EQ(valid_code_units, 0uz); + + EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); + EXPECT_EQ(valid_code_units, 2uz); } { // Valid UTF-16 followed by invalid code units. - auto invalid = Array { (u16)0x41, 0x41, 0xd800 }; - EXPECT(!Utf16View(invalid).validate(valid_code_units)); - EXPECT(valid_code_units == 2); + invalid = u"\x0041\x0041\xd800"; + EXPECT(!invalid.validate(valid_code_units)); + EXPECT_EQ(valid_code_units, 2uz); - invalid = Array { (u16)0x41, 0x41, 0xd800 }; - EXPECT(!Utf16View(invalid).validate(valid_code_units)); - EXPECT(valid_code_units == 2); + EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); + EXPECT_EQ(valid_code_units, 3uz); + + invalid = u"\x0041\x0041\xd800"; + EXPECT(!invalid.validate(valid_code_units)); + EXPECT_EQ(valid_code_units, 2uz); + + EXPECT(invalid.validate(valid_code_units, Utf16View::AllowInvalidCodeUnits::Yes)); + EXPECT_EQ(valid_code_units, 3uz); } }