HTMLEncodingDetection: Use MIME type in encoding sniffing

Also add proper spec comments. This fixes at least one previously failing WPT test: https://wpt.live/encoding/single-byte-decoder.window.html?document
Author: https://github.com/0x4261756D Commit: c1a14f66ad Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/1707
2025-08-06 16:19:23 +00:00 · 2024-10-10 02:04:07 +02:00 · 2024-10-10 02:04:07 +02:00 · c1a14f66ad · 2024-10-12 14:15:30 +00:00
commit c1a14f66ad
parent dae6200c1d
3 changed files with 67 additions and 28 deletions
--- a/Userland/Libraries/LibWeb/DOM/DocumentLoading.cpp
+++ b/Userland/Libraries/LibWeb/DOM/DocumentLoading.cpp
@ -146,14 +146,14 @@ static WebIDL::ExceptionOr<JS::NonnullGCPtr<DOM::Document>> load_xml_document(HT
    if (auto maybe_encoding = type.parameters().get("charset"sv); maybe_encoding.has_value())
        content_encoding = maybe_encoding.value();

-    auto process_body = JS::create_heap_function(document->heap(), [document, url = navigation_params.response->url().value(), content_encoding = move(content_encoding)](ByteBuffer data) {
+    auto process_body = JS::create_heap_function(document->heap(), [document, url = navigation_params.response->url().value(), content_encoding = move(content_encoding), mime = type](ByteBuffer data) {
        Optional<TextCodec::Decoder&> decoder;
        // The actual HTTP headers and other metadata, not the headers as mutated or implied by the algorithms given in this specification,
        // are the ones that must be used when determining the character encoding according to the rules given in the above specifications.
        if (content_encoding.has_value())
            decoder = TextCodec::decoder_for(*content_encoding);
        if (!decoder.has_value()) {
-            auto encoding = HTML::run_encoding_sniffing_algorithm(document, data);
+            auto encoding = HTML::run_encoding_sniffing_algorithm(document, data, mime);
            decoder = TextCodec::decoder_for(encoding);
        }
        VERIFY(decoder.has_value());
@ -226,8 +226,8 @@ static WebIDL::ExceptionOr<JS::NonnullGCPtr<DOM::Document>> load_text_document(H
    //    document's relevant global object to have the parser to process the implied EOF character, which eventually causes a
    //    load event to be fired.
    // FIXME: Parse as we receive the document data, instead of waiting for the whole document to be fetched first.
-    auto process_body = JS::create_heap_function(document->heap(), [document, url = navigation_params.response->url().value()](ByteBuffer data) {
-        auto encoding = run_encoding_sniffing_algorithm(document, data);
+    auto process_body = JS::create_heap_function(document->heap(), [document, url = navigation_params.response->url().value(), mime = type](ByteBuffer data) {
+        auto encoding = run_encoding_sniffing_algorithm(document, data, mime);
        dbgln_if(HTML_PARSER_DEBUG, "The encoding sniffing algorithm returned encoding '{}'", encoding);

        auto parser = HTML::HTMLParser::create_for_scripting(document);