# Summary: add an opt-in surrogate check to Utf8View validation.
#
# AK/Utf8View.h
#   * Adds the nested enum Utf8View::AllowSurrogates { Yes, No }.
#   * Both validate() overloads take a trailing AllowSurrogates parameter and
#     forward it to is_valid_code_point(). Every new parameter defaults to
#     AllowSurrogates::Yes, so existing call sites compile unchanged.
#   * With AllowSurrogates::No, is_valid_code_point() returns false for a
#     3-byte sequence whose decoded value lies in 0xD800..0xDFFF, before the
#     utf8_encoded_byte_data range lookup runs.
#
# AK/Utf16View.cpp
#   * to_utf8(): the first branch now returns
#     builder.to_string_without_validation(); the code-point loop moves out of
#     the old else-branch and still ends in builder.to_string().
#     (Presumably the first branch is the AllowInvalidCodeUnits::Yes path; its
#     condition lies outside the hunk - confirm against the file.)
#
# Tests/AK/TestUtf8.cpp
#   * Adds a case: bytes ED A0 80 (U+D800) must fail
#     validate(valid_bytes, Utf8View::AllowSurrogates::No) with valid_bytes == 0.
#
# NOTE(review): this patch was recovered from a copy whose line breaks and
# indentation had been collapsed. Indentation and blank context lines were
# reconstructed so that each hunk matches the line counts in its @@ header;
# verify against the upstream files before applying.
# NOTE(review): template arguments appeared stripped in the collapsed copy
# ("ErrorOr Utf16View::to_utf8", "static_cast(*ptr)"). They are restored here
# as ErrorOr<String> and static_cast<u32> - confirm against upstream.
# NOTE(review): the visible test hunk only exercises U+D800 with
# AllowSurrogates::No; consider also covering U+DFFF and the default
# (AllowSurrogates::Yes) path in a follow-up.
diff --git a/AK/Utf16View.cpp b/AK/Utf16View.cpp
index 1a5c019fb45..4150f57ed7e 100644
--- a/AK/Utf16View.cpp
+++ b/AK/Utf16View.cpp
@@ -105,11 +105,12 @@ ErrorOr<String> Utf16View::to_utf8(AllowInvalidCodeUnits allow_invalid_code_unit
 
             TRY(builder.try_append_code_point(static_cast<u32>(*ptr)));
         }
-    } else {
-        for (auto code_point : *this)
-            TRY(builder.try_append_code_point(code_point));
+        return builder.to_string_without_validation();
     }
 
+    for (auto code_point : *this)
+        TRY(builder.try_append_code_point(code_point));
+
     return builder.to_string();
 }
 
diff --git a/AK/Utf8View.h b/AK/Utf8View.h
index 0e1184401fb..e1b1ae70ccf 100644
--- a/AK/Utf8View.h
+++ b/AK/Utf8View.h
@@ -80,6 +80,11 @@ public:
     explicit Utf8View(ByteString&&) = delete;
 #endif
 
+    enum class AllowSurrogates {
+        Yes,
+        No,
+    };
+
     ~Utf8View() = default;
 
     StringView as_string() const { return m_string; }
@@ -121,13 +126,13 @@ public:
         return m_length;
     }
 
-    constexpr bool validate() const
+    constexpr bool validate(AllowSurrogates surrogates = AllowSurrogates::Yes) const
     {
         size_t valid_bytes = 0;
-        return validate(valid_bytes);
+        return validate(valid_bytes, surrogates);
     }
 
-    constexpr bool validate(size_t& valid_bytes) const
+    constexpr bool validate(size_t& valid_bytes, AllowSurrogates surrogates = AllowSurrogates::Yes) const
     {
         valid_bytes = 0;
 
@@ -148,7 +153,7 @@ public:
                 code_point |= code_point_bits;
             }
 
-            if (!is_valid_code_point(code_point, byte_length))
+            if (!is_valid_code_point(code_point, byte_length, surrogates))
                 return false;
 
             valid_bytes += byte_length;
@@ -216,8 +221,10 @@ private:
         return { .is_valid = false };
     }
 
-    static constexpr bool is_valid_code_point(u32 code_point, size_t byte_length)
+    static constexpr bool is_valid_code_point(u32 code_point, size_t byte_length, AllowSurrogates surrogates = AllowSurrogates::Yes)
     {
+        if (surrogates == AllowSurrogates::No && byte_length == 3 && code_point >= 0xD800 && code_point <= 0xDFFF)
+            return false;
         for (auto const& data : utf8_encoded_byte_data) {
             if (code_point >= data.first_code_point && code_point <= data.last_code_point)
                 return byte_length == data.byte_length;
diff --git a/Tests/AK/TestUtf8.cpp b/Tests/AK/TestUtf8.cpp
index bc0d5b24ee3..726d79dca08 100644
--- a/Tests/AK/TestUtf8.cpp
+++ b/Tests/AK/TestUtf8.cpp
@@ -80,6 +80,11 @@ TEST_CASE(validate_invalid_ut8)
     Utf8View utf8_6 { StringView { invalid_utf8_6, 4 } };
     EXPECT(!utf8_6.validate(valid_bytes));
     EXPECT(valid_bytes == 0);
+
+    char invalid_utf8_7[] = { (char)0xed, (char)0xa0, (char)0x80 }; // U+d800
+    Utf8View utf8_7 { StringView { invalid_utf8_7, 3 } };
+    EXPECT(!utf8_7.validate(valid_bytes, Utf8View::AllowSurrogates::No));
+    EXPECT(valid_bytes == 0);
 }
 
 TEST_CASE(validate_overlong_utf8)