LibWeb: Implement encoding sniffing algorithm

This patch implements the HTML specification's "encoding sniffing
algorithm", which is used when no encoding can be obtained from the
Content-Type header (either because it doesn't contain a charset=...
value or because the file has not been opened via HTTP, as with local files).

It also modifies the creator of the HTMLDocumentParser to use the new
HTMLDocumentParser::create_with_uncertain_encoding static method, which
runs the encoding sniffing algorithm before instantiating the parser.

This now allows us to load local HTML pages (or remote pages without a
charset specified in the 'Content-Type' header) with a non-UTF-8
encoding such as 'windows-1252'. This would previously crash the
browser. :^)
This commit is contained in:
Max Wipfli 2021-05-12 10:47:12 +02:00 committed by Andreas Kling
parent 67a9ebc817
commit f808279769
Notes: sideshowbarker 2024-07-18 17:48:35 +09:00
6 changed files with 261 additions and 2 deletions

View file

@ -22,6 +22,7 @@
#include <LibWeb/HTML/HTMLTableElement.h>
#include <LibWeb/HTML/HTMLTemplateElement.h>
#include <LibWeb/HTML/Parser/HTMLDocumentParser.h>
#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
#include <LibWeb/HTML/Parser/HTMLToken.h>
#include <LibWeb/Namespace.h>
#include <LibWeb/SVG/TagNames.h>
@ -3039,4 +3040,14 @@ NonnullRefPtrVector<DOM::Node> HTMLDocumentParser::parse_html_fragment(DOM::Elem
}
return children;
}
// Builds an HTMLDocumentParser for `input` when the caller does not know the encoding up front.
//
// If `document` reports that it already has an encoding, that encoding is handed to the
// parser unchanged. Otherwise the encoding sniffing algorithm is run over `input`, its
// result is logged, and the parser is constructed with the sniffed encoding.
NonnullOwnPtr<HTMLDocumentParser> HTMLDocumentParser::create_with_uncertain_encoding(DOM::Document& document, const ByteBuffer& input)
{
    if (!document.has_encoding()) {
        // No encoding recorded on the document: fall back to sniffing the raw bytes.
        auto sniffed_encoding = run_encoding_sniffing_algorithm(input);
        dbgln("The encoding sniffing algorithm returned encoding '{}'", sniffed_encoding);
        return make<HTMLDocumentParser>(document, input, sniffed_encoding);
    }

    // The document already knows its encoding; use it as-is.
    return make<HTMLDocumentParser>(document, input, document.encoding().value());
}
}