From f672c57ca7f9d9604bf0654dba28c343a996ecfb Mon Sep 17 00:00:00 2001 From: Jaycadox <68955944+Jaycadox@users.noreply.github.com> Date: Mon, 24 Feb 2025 10:26:28 +0100 Subject: [PATCH] LibWeb: Check charset UTF-16LE/BE separately for UTF-8 conversion Previously, when following the spec to convert the charset to UTF-8, the charset name was compared against the literal string "UTF-16BE/LE". In reality, the charsets "UTF-16BE" and "UTF-16LE" should each be checked separately. Co-authored-by: Jelle Raaijmakers --- .../HTML/Parser/HTMLEncodingDetection.cpp | 3 +- .../expected/document-computed-mimetype.txt | 5 ++- .../input/document-computed-mimetype.html | 40 +++++++++++++------ 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp b/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp index 67568045b6d..4d860159b78 100644 --- a/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp +++ b/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp @@ -311,7 +311,8 @@ Optional run_prescan_byte_stream_algorithm(DOM::Document& document, if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value()) continue; - if (charset.value() == "UTF-16BE/LE") + // https://encoding.spec.whatwg.org/#common-infrastructure-for-utf-16be-and-utf-16le + if (charset.value() == "UTF-16BE" || charset.value() == "UTF-16LE") return "UTF-8"; else if (charset.value() == "x-user-defined") return "windows-1252"; diff --git a/Tests/LibWeb/Text/expected/document-computed-mimetype.txt b/Tests/LibWeb/Text/expected/document-computed-mimetype.txt index e186e01bb9f..c3676e9ea08 100644 --- a/Tests/LibWeb/Text/expected/document-computed-mimetype.txt +++ b/Tests/LibWeb/Text/expected/document-computed-mimetype.txt @@ -1 +1,4 @@ -PASS: UTF-8 +Encoding: utf-16be mapped to characterSet: UTF-8 +Encoding: utf-16le mapped to characterSet: UTF-8 +Encoding: utf-8 mapped to characterSet: UTF-8 +Encoding: x-user-defined mapped to 
characterSet: windows-1252 diff --git a/Tests/LibWeb/Text/input/document-computed-mimetype.html b/Tests/LibWeb/Text/input/document-computed-mimetype.html index 32e7e12f0a8..78f24a5e80b 100644 --- a/Tests/LibWeb/Text/input/document-computed-mimetype.html +++ b/Tests/LibWeb/Text/input/document-computed-mimetype.html @@ -2,23 +2,37 @@