LibWeb: Check charset UTF-16LE/BE separately for UTF-8 conversion

Previously, while following the standard's step that converts certain
charsets to UTF-8, the charset name was compared against the single
literal string "UTF-16BE/LE". In reality, the charsets "UTF-16BE" and
"UTF-16LE" should each be checked separately.

Co-authored-by: Jelle Raaijmakers <jelle@ladybird.org>
This commit is contained in:
Jaycadox 2025-02-24 10:26:28 +01:00 committed by Jelle Raaijmakers
commit f672c57ca7
Notes: github-actions[bot] 2025-02-24 13:52:43 +00:00
3 changed files with 33 additions and 15 deletions

View file

@ -311,7 +311,8 @@ Optional<ByteString> run_prescan_byte_stream_algorithm(DOM::Document& document,
if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
continue;
if (charset.value() == "UTF-16BE/LE")
// https://encoding.spec.whatwg.org/#common-infrastructure-for-utf-16be-and-utf-16le
if (charset.value() == "UTF-16BE" || charset.value() == "UTF-16LE")
return "UTF-8";
else if (charset.value() == "x-user-defined")
return "windows-1252";

View file

@ -1 +1,4 @@
PASS: UTF-8
Encoding: utf-16be mapped to characterSet: UTF-8
Encoding: utf-16le mapped to characterSet: UTF-8
Encoding: utf-8 mapped to characterSet: UTF-8
Encoding: x-user-defined mapped to characterSet: windows-1252

View file

@ -2,23 +2,37 @@
<script src="include.js"></script>
<script>
asyncTest(async (done) => {
const encodings = ['utf-8', 'utf-16be', 'utf-16le', 'x-user-defined'];
let receivedMessages = [];
const dumpMessages = () => {
receivedMessages.sort((a, b) => a.encoding.localeCompare(b.encoding));
for (const receivedMessage of receivedMessages) {
println(`Encoding: ${receivedMessage.encoding} mapped to characterSet: ${receivedMessage.characterSet}`);
}
};
addEventListener("message", (event) => {
receivedMessages.push(event.data);
if (receivedMessages.length == encodings.length) {
dumpMessages();
done();
}
}, false);
const httpServer = httpTestServer();
const url = await httpServer.createEcho("GET", "/document-computed-mimetype-test", {
for (let encoding of encodings) {
const url = await httpServer.createEcho("GET", `/document-computed-mimetype-test-${encoding}`, {
status: 200,
headers: {
"Access-Control-Allow-Origin": "*",
},
body: `<!doctype html><meta charset="UTF-8"><script>parent.postMessage(document.characterSet, "*")<\/script>`,
body: `<!doctype html><meta charset="${encoding}"><script>parent.postMessage({"encoding": "${encoding}", "characterSet": document.characterSet}, "*")<\/script>`,
});
const frame = document.createElement('iframe');
const frame = document.createElement("iframe");
frame.src = url;
addEventListener("message", (event) => {
println("PASS: " + event.data);
done();
}, false);
document.body.appendChild(frame);
}
});
</script>