diff --git a/AK/String.cpp b/AK/String.cpp
index 1ae9923ecab..00162a8aa37 100644
--- a/AK/String.cpp
+++ b/AK/String.cpp
@@ -27,13 +27,19 @@ String String::from_utf8_with_replacement_character(StringView view, WithBOMHand
     if (auto bytes = view.bytes(); with_bom_handling == WithBOMHandling::Yes && bytes.starts_with({ { 0xEF, 0xBB, 0xBF } }))
         view = view.substring_view(3);
 
-    if (Utf8View(view).validate())
+    Utf8View utf8_view { view };
+
+    if (utf8_view.validate(AllowLonelySurrogates::No))
         return String::from_utf8_without_validation(view.bytes());
 
-    StringBuilder builder;
+    StringBuilder builder(view.length());
 
-    for (auto c : Utf8View { view })
-        builder.append_code_point(c);
+    for (auto code_point : utf8_view) {
+        if (is_unicode_surrogate(code_point))
+            builder.append_code_point(UnicodeUtils::REPLACEMENT_CODE_POINT);
+        else
+            builder.append_code_point(code_point);
+    }
 
     return builder.to_string_without_validation();
 }
diff --git a/Tests/AK/TestString.cpp b/Tests/AK/TestString.cpp
index 010a36c5308..8709531ec74 100644
--- a/Tests/AK/TestString.cpp
+++ b/Tests/AK/TestString.cpp
@@ -190,6 +190,9 @@ TEST_CASE(with_replacement_character)
 
     auto string6 = String::from_utf8_with_replacement_character("\xEF\xBB\xBFWHF!"sv, String::WithBOMHandling::No);
     EXPECT_EQ(string6, "\xEF\xBB\xBFWHF!"sv);
+
+    auto string7 = String::from_utf8_with_replacement_character("\xED\xA0\x80WHF!"sv); // U+D800
+    EXPECT_EQ(string7, "\ufffdWHF!"sv);
 }
 
 TEST_CASE(from_code_points)
diff --git a/Tests/LibWeb/Text/expected/Encoding/TextDecoder-decode.txt b/Tests/LibWeb/Text/expected/Encoding/TextDecoder-decode.txt
index 8fc16ea51f2..3c19fc75736 100644
--- a/Tests/LibWeb/Text/expected/Encoding/TextDecoder-decode.txt
+++ b/Tests/LibWeb/Text/expected/Encoding/TextDecoder-decode.txt
@@ -1,2 +1,3 @@
 [ABC]
 []
+[fffd]
diff --git a/Tests/LibWeb/Text/input/Encoding/TextDecoder-decode.html b/Tests/LibWeb/Text/input/Encoding/TextDecoder-decode.html
index f394c6ab3bf..1f45ff6ca6c 100644
--- a/Tests/LibWeb/Text/input/Encoding/TextDecoder-decode.html
+++ b/Tests/LibWeb/Text/input/Encoding/TextDecoder-decode.html
@@ -6,7 +6,10 @@
             let decoder = new TextDecoder("utf-8");
             println(`[${decoder.decode(new Uint8Array([0x41, 0x42, 0x43]))}]`); // "ABC"
             println(`[${decoder.decode()}]`);
-        } catch(e) {
+
+            const surrogate = decoder.decode(new Uint8Array([0xed, 0xa0, 0x80])); // U+D800
+            println(`[${surrogate.codePointAt(0).toString(16)}]`);
+        } catch (e) {
             println("ERROR: " + e.name + ": " + e.message);
         }
     });