mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-05-02 17:28:48 +00:00
LibWeb: Implement encoding sniffing algorithm
This patch implements the HTML specification's "encoding sniffing algorithm", which is used when no encoding can be obtained from the Content-Type header (either because it doesn't contain a charset=...) value or the file has not been opened via HTTP (as with local files). It also modifies the creator of the HTMLDocumentParser to use the new HTMLDocumentParser::create_with_uncertain_encoding static method, which runs the encoding sniffing algorithm before instantiating the parser. This now allows us to load local HTML pages (or remote pages without a charset specified in the 'Content-Type' header) with a non-UTF-8 encoding such as 'windows-1252'. This would previously crash the browser. :^)
This commit is contained in:
parent
67a9ebc817
commit
f808279769
Notes:
sideshowbarker
2024-07-18 17:48:35 +09:00
Author: https://github.com/MaxWipfli
Commit: f808279769
Pull-request: https://github.com/SerenityOS/serenity/pull/7055
Issue: https://github.com/SerenityOS/serenity/issues/6910
Reviewed-by: https://github.com/Dexesttp
Reviewed-by: https://github.com/awesomekling
Reviewed-by: https://github.com/linusg
6 changed files with 261 additions and 2 deletions
223
Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
Normal file
223
Userland/Libraries/LibWeb/HTML/Parser/HTMLEncodingDetection.cpp
Normal file
|
@ -0,0 +1,223 @@
|
|||
/*
|
||||
* Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <AK/StringView.h>
|
||||
#include <AK/Utf8View.h>
|
||||
#include <LibTextCodec/Decoder.h>
|
||||
#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
|
||||
#include <ctype.h>
|
||||
|
||||
namespace Web::HTML {
|
||||
|
||||
bool prescan_should_abort(const ByteBuffer& input, const size_t& position)
|
||||
{
|
||||
return position >= input.size() || position >= 1024;
|
||||
}
|
||||
|
||||
bool prescan_is_whitespace_or_slash(const u8& byte)
|
||||
{
|
||||
return byte == '\t' || byte == '\n' || byte == '\f' || byte == '\r' || byte == ' ' || byte == '/';
|
||||
}
|
||||
|
||||
bool prescan_skip_whitespace_and_slashes(const ByteBuffer& input, size_t& position)
|
||||
{
|
||||
while (!prescan_should_abort(input, position) && (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '/'))
|
||||
++position;
|
||||
return !prescan_should_abort(input, position);
|
||||
}
|
||||
|
||||
Optional<Attribute> prescan_get_attribute(const ByteBuffer& input, size_t& position)
|
||||
{
|
||||
if (!prescan_skip_whitespace_and_slashes(input, position))
|
||||
return {};
|
||||
if (input[position] == '>')
|
||||
return {};
|
||||
|
||||
StringBuilder attribute_name;
|
||||
while (true) {
|
||||
if (input[position] == '=' && !attribute_name.is_empty()) {
|
||||
++position;
|
||||
goto value;
|
||||
} else if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ')
|
||||
goto spaces;
|
||||
else if (input[position] == '/' || input[position] == '>')
|
||||
return Attribute(attribute_name.to_string(), "");
|
||||
else
|
||||
attribute_name.append_as_lowercase(input[position]);
|
||||
++position;
|
||||
if (prescan_should_abort(input, position))
|
||||
return {};
|
||||
}
|
||||
|
||||
spaces:
|
||||
if (!prescan_skip_whitespace_and_slashes(input, position))
|
||||
return {};
|
||||
if (input[position] != '=')
|
||||
return Attribute(attribute_name.to_string(), "");
|
||||
++position;
|
||||
|
||||
value:
|
||||
if (!prescan_skip_whitespace_and_slashes(input, position))
|
||||
return {};
|
||||
|
||||
StringBuilder attribute_value;
|
||||
if (input[position] == '"' || input[position] == '\'') {
|
||||
u8 quote_character = input[position];
|
||||
++position;
|
||||
for (; !prescan_should_abort(input, position); ++position) {
|
||||
if (input[position] == quote_character)
|
||||
return Attribute(attribute_name.to_string(), attribute_value.to_string());
|
||||
else
|
||||
attribute_value.append_as_lowercase(input[position]);
|
||||
}
|
||||
return {};
|
||||
} else if (input[position] == '>')
|
||||
return Attribute(attribute_name.to_string(), "");
|
||||
else
|
||||
attribute_value.append_as_lowercase(input[position]);
|
||||
|
||||
++position;
|
||||
if (prescan_should_abort(input, position))
|
||||
return {};
|
||||
|
||||
for (; !prescan_should_abort(input, position); ++position) {
|
||||
if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>')
|
||||
return Attribute(attribute_name.to_string(), attribute_value.to_string());
|
||||
else
|
||||
attribute_value.append_as_lowercase(input[position]);
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
|
||||
Optional<String> run_prescan_byte_stream_algorithm(const ByteBuffer& input)
|
||||
{
|
||||
// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
|
||||
|
||||
// Detects '<?x'
|
||||
if (!prescan_should_abort(input, 6)) {
|
||||
if (input[0] == 0x3C && input[1] == 0x00 && input[2] == 0x3F && input[3] == 0x00 && input[4] == 0x78 && input[5] == 0x00)
|
||||
return "utf-16le";
|
||||
if (input[0] == 0x00 && input[1] == 0x3C && input[2] == 0x00 && input[4] == 0x3F && input[5] == 0x00 && input[6] == 0x78)
|
||||
return "utf-16be";
|
||||
}
|
||||
|
||||
for (size_t position = 0; !prescan_should_abort(input, position); ++position) {
|
||||
if (!prescan_should_abort(input, position + 5) && input[position] == '<' && input[position + 1] == '!'
|
||||
&& input[position + 2] == '-' && input[position + 3] == '-') {
|
||||
position += 2;
|
||||
for (; !prescan_should_abort(input, position + 3); ++position) {
|
||||
if (input[position] == '-' && input[position + 1] == '-' && input[position + 2] == '>') {
|
||||
position += 2;
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else if (!prescan_should_abort(input, position + 6)
|
||||
&& input[position] == '<'
|
||||
&& (input[position + 1] == 'M' || input[position + 1] == 'm')
|
||||
&& (input[position + 2] == 'E' || input[position + 2] == 'e')
|
||||
&& (input[position + 3] == 'T' || input[position + 3] == 't')
|
||||
&& (input[position + 4] == 'A' || input[position + 4] == 'a')
|
||||
&& prescan_is_whitespace_or_slash(input[position + 5])) {
|
||||
position += 6;
|
||||
Vector<String> attribute_list {};
|
||||
bool got_pragma = false;
|
||||
Optional<bool> need_pragma {};
|
||||
Optional<String> charset {};
|
||||
|
||||
while (true) {
|
||||
auto attribute = prescan_get_attribute(input, position);
|
||||
if (!attribute.has_value())
|
||||
break;
|
||||
if (attribute_list.contains_slow(attribute.value().name()))
|
||||
continue;
|
||||
auto& attribute_name = attribute.value().name();
|
||||
attribute_list.append(attribute.value().name());
|
||||
|
||||
if (attribute_name == "http-equiv" && attribute.value().value() == "content-type")
|
||||
got_pragma = true;
|
||||
else if (attribute_name == "charset") {
|
||||
auto maybe_charset = TextCodec::get_standardized_encoding(attribute.value().value());
|
||||
if (maybe_charset.has_value()) {
|
||||
charset = Optional<String> { maybe_charset };
|
||||
need_pragma = { false };
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: For attribute name "content", do this:
|
||||
// Apply the "algorithm for extracting a character encoding from a meta
|
||||
// element", giving the attribute's value as the string to parse. If a
|
||||
// character encoding is returned, and if charset is still set to null,
|
||||
// let charset be the encoding returned, and set need pragma to true.
|
||||
}
|
||||
|
||||
if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
|
||||
continue;
|
||||
if (charset.value() == "UTF-16BE/LE")
|
||||
return "UTF-8";
|
||||
else if (charset.value() == "x-user-defined")
|
||||
return "windows-1252";
|
||||
else
|
||||
return charset.value();
|
||||
} else if (!prescan_should_abort(input, position + 3) && input[position] == '<'
|
||||
&& ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) {
|
||||
position += 2;
|
||||
prescan_skip_whitespace_and_slashes(input, position);
|
||||
while (prescan_get_attribute(input, position).has_value()) { };
|
||||
} else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) {
|
||||
position += 2;
|
||||
while (input[position] != '>') {
|
||||
++position;
|
||||
if (prescan_should_abort(input, position))
|
||||
return {};
|
||||
}
|
||||
} else {
|
||||
// Do nothing.
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
|
||||
String run_encoding_sniffing_algorithm(const ByteBuffer& input)
|
||||
{
|
||||
if (input.size() >= 2) {
|
||||
if (input[0] == 0xFE && input[1] == 0xFF) {
|
||||
return "UTF-16BE";
|
||||
} else if (input[0] == 0xFF && input[1] == 0xFE) {
|
||||
return "UTF-16LE";
|
||||
} else if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF) {
|
||||
return "UTF-8";
|
||||
}
|
||||
}
|
||||
|
||||
// FIXME: If the user has explicitly instructed the user agent to override the document's character
|
||||
// encoding with a specific encoding.
|
||||
// FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or
|
||||
// at any later step in this algorithm.
|
||||
// FIXME: If the transport layer specifies a character encoding, and it is supported.
|
||||
|
||||
auto optional_encoding = run_prescan_byte_stream_algorithm(input);
|
||||
if (optional_encoding.has_value()) {
|
||||
return optional_encoding.value();
|
||||
}
|
||||
|
||||
// FIXME: If the HTML parser for which this algorithm is being run is associated with a Document whose browsing context
|
||||
// is non-null and a child browsing context.
|
||||
// FIXME: If the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page
|
||||
// when it was last visited.
|
||||
|
||||
if (!Utf8View(StringView(input)).validate()) {
|
||||
// FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
|
||||
return "windows-1252";
|
||||
}
|
||||
|
||||
// NOTE: This is the authoritative place to actually decide on using the default encoding as per the HTML specification.
|
||||
// "Otherwise, return an implementation-defined or user-specified default character encoding, [...]."
|
||||
return "UTF-8";
|
||||
}
|
||||
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue