mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-08-22 10:19:20 +00:00
AK: Replace surrogates in String::from_utf8_with_replacement_character
Some checks are pending
CI / macOS, arm64, Sanitizer_CI, Clang (push) Waiting to run
CI / Linux, x86_64, Fuzzers_CI, Clang (push) Waiting to run
CI / Linux, x86_64, Sanitizer_CI, GNU (push) Waiting to run
CI / Linux, x86_64, Sanitizer_CI, Clang (push) Waiting to run
Package the js repl as a binary artifact / Linux, arm64 (push) Waiting to run
Package the js repl as a binary artifact / macOS, arm64 (push) Waiting to run
Package the js repl as a binary artifact / Linux, x86_64 (push) Waiting to run
Run test262 and test-wasm / run_and_update_results (push) Waiting to run
Lint Code / lint (push) Waiting to run
Label PRs with merge conflicts / auto-labeler (push) Waiting to run
Push notes / build (push) Waiting to run
Some checks are pending
CI / macOS, arm64, Sanitizer_CI, Clang (push) Waiting to run
CI / Linux, x86_64, Fuzzers_CI, Clang (push) Waiting to run
CI / Linux, x86_64, Sanitizer_CI, GNU (push) Waiting to run
CI / Linux, x86_64, Sanitizer_CI, Clang (push) Waiting to run
Package the js repl as a binary artifact / Linux, arm64 (push) Waiting to run
Package the js repl as a binary artifact / macOS, arm64 (push) Waiting to run
Package the js repl as a binary artifact / Linux, x86_64 (push) Waiting to run
Run test262 and test-wasm / run_and_update_results (push) Waiting to run
Lint Code / lint (push) Waiting to run
Label PRs with merge conflicts / auto-labeler (push) Waiting to run
Push notes / build (push) Waiting to run
We are expected to replace lonely surrogates with U+FFFD when decoding UTF-8 text.
This commit is contained in:
parent
51afbf5280
commit
01ebf1eb07
Notes:
github-actions[bot]
2025-07-05 16:31:19 +00:00
Author: https://github.com/trflynn89
Commit: 01ebf1eb07
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5313
Reviewed-by: https://github.com/shannonbooth ✅
4 changed files with 18 additions and 5 deletions
|
@@ -27,13 +27,19 @@ String String::from_utf8_with_replacement_character(StringView view, WithBOMHand
|
||||||
if (auto bytes = view.bytes(); with_bom_handling == WithBOMHandling::Yes && bytes.starts_with({ { 0xEF, 0xBB, 0xBF } }))
|
if (auto bytes = view.bytes(); with_bom_handling == WithBOMHandling::Yes && bytes.starts_with({ { 0xEF, 0xBB, 0xBF } }))
|
||||||
view = view.substring_view(3);
|
view = view.substring_view(3);
|
||||||
|
|
||||||
if (Utf8View(view).validate())
|
Utf8View utf8_view { view };
|
||||||
|
|
||||||
|
if (utf8_view.validate(AllowLonelySurrogates::No))
|
||||||
return String::from_utf8_without_validation(view.bytes());
|
return String::from_utf8_without_validation(view.bytes());
|
||||||
|
|
||||||
StringBuilder builder;
|
StringBuilder builder(view.length());
|
||||||
|
|
||||||
for (auto c : Utf8View { view })
|
for (auto code_point : utf8_view) {
|
||||||
builder.append_code_point(c);
|
if (is_unicode_surrogate(code_point))
|
||||||
|
builder.append_code_point(UnicodeUtils::REPLACEMENT_CODE_POINT);
|
||||||
|
else
|
||||||
|
builder.append_code_point(code_point);
|
||||||
|
}
|
||||||
|
|
||||||
return builder.to_string_without_validation();
|
return builder.to_string_without_validation();
|
||||||
}
|
}
|
||||||
|
|
|
@@ -190,6 +190,9 @@ TEST_CASE(with_replacement_character)
|
||||||
|
|
||||||
auto string6 = String::from_utf8_with_replacement_character("\xEF\xBB\xBFWHF!"sv, String::WithBOMHandling::No);
|
auto string6 = String::from_utf8_with_replacement_character("\xEF\xBB\xBFWHF!"sv, String::WithBOMHandling::No);
|
||||||
EXPECT_EQ(string6, "\xEF\xBB\xBFWHF!"sv);
|
EXPECT_EQ(string6, "\xEF\xBB\xBFWHF!"sv);
|
||||||
|
|
||||||
|
auto string7 = String::from_utf8_with_replacement_character("\xED\xA0\x80WHF!"sv); // U+D800
|
||||||
|
EXPECT_EQ(string7, "\ufffdWHF!"sv);
|
||||||
}
|
}
|
||||||
|
|
||||||
TEST_CASE(from_code_points)
|
TEST_CASE(from_code_points)
|
||||||
|
|
|
@@ -1,2 +1,3 @@
|
||||||
[ABC]
|
[ABC]
|
||||||
[]
|
[]
|
||||||
|
[fffd]
|
||||||
|
|
|
@@ -6,6 +6,9 @@
|
||||||
let decoder = new TextDecoder("utf-8");
|
let decoder = new TextDecoder("utf-8");
|
||||||
println(`[${decoder.decode(new Uint8Array([0x41, 0x42, 0x43]))}]`); // "ABC"
|
println(`[${decoder.decode(new Uint8Array([0x41, 0x42, 0x43]))}]`); // "ABC"
|
||||||
println(`[${decoder.decode()}]`);
|
println(`[${decoder.decode()}]`);
|
||||||
|
|
||||||
|
const surrogate = decoder.decode(new Uint8Array([0xed, 0xa0, 0x80])); // U+D800
|
||||||
|
println(`[${surrogate.codePointAt(0).toString(16)}]`);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
println("ERROR: " + e.name + ": " + e.message);
|
println("ERROR: " + e.name + ": " + e.message);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue