Everywhere: Hoist the Libraries folder to the top-level

This commit is contained in:
Timothy Flynn 2024-11-09 12:25:08 -05:00 committed by Andreas Kling
commit 93712b24bf
Notes: github-actions[bot] 2024-11-10 11:51:52 +00:00
4547 changed files with 104 additions and 113 deletions

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,21 @@
/*
* Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/StringView.h>
#include <AK/Vector.h>
namespace Web::HTML {
// Result of matching a named character reference (entity).
struct EntityMatch {
// Code points the entity stands for; the vector has inline capacity for two.
Vector<u32, 2> code_points;
// View of the entity text associated with this match.
// NOTE(review): presumably the matched entity name itself — confirm against the implementation.
StringView entity;
};
// Looks up an entity for the given text and returns the match, or an empty Optional.
// NOTE(review): the exact matching rules (prefix vs. exact match) are defined in the implementation, not visible here.
Optional<EntityMatch> code_points_from_entity(StringView);
}

View file

@ -0,0 +1,416 @@
/*
* Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/CharacterTypes.h>
#include <AK/GenericLexer.h>
#include <AK/StringView.h>
#include <AK/Utf8View.h>
#include <LibTextCodec/Decoder.h>
#include <LibWeb/DOM/Attr.h>
#include <LibWeb/DOM/Document.h>
#include <LibWeb/HTML/Parser/HTMLEncodingDetection.h>
#include <LibWeb/Infra/CharacterTypes.h>
#include <ctype.h>
namespace Web::HTML {
// Tells the prescan whether it must stop at `position`.
// The prescan stops once it has looked at 1024 bytes, or when `position` no longer
// refers to a byte inside `input`.
bool prescan_should_abort(ByteBuffer const& input, size_t const& position)
{
    if (position >= 1024)
        return true;
    return position >= input.size();
}
// Returns true for the bytes the prescan treats as separators:
// 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP) and 0x2F (/).
bool prescan_is_whitespace_or_slash(u8 const& byte)
{
    switch (byte) {
    case '\t':
    case '\n':
    case '\f':
    case '\r':
    case ' ':
    case '/':
        return true;
    default:
        return false;
    }
}
// Moves `position` forward over any run of HT, LF, FF, CR, SP or '/' bytes.
// Returns true if `position` then refers to a byte the prescan may still inspect,
// and false if the prescan has to abort (end of input or 1024-byte limit reached).
bool prescan_skip_whitespace_and_slashes(ByteBuffer const& input, size_t& position)
{
    for (;;) {
        if (prescan_should_abort(input, position))
            return false;
        if (!prescan_is_whitespace_or_slash(input[position]))
            return true;
        ++position;
    }
}
// https://html.spec.whatwg.org/multipage/urls-and-fetching.html#algorithm-for-extracting-a-character-encoding-from-a-meta-element
// Finds the first "charset" keyword in `string` that is followed (after optional whitespace) by '=',
// then returns the standardized encoding named by the value after the '='.
// The value may be double-quoted, single-quoted, or unquoted (ending at whitespace or ';').
// Returns an empty Optional when no such keyword exists, when nothing follows the '=',
// when an opening quote has no matching closing quote, or when the encoding lookup fails.
Optional<StringView> extract_character_encoding_from_meta_element(ByteString const& string)
{
    constexpr auto charset_keyword = "charset"sv;
    auto const is_whitespace = [](char c) {
        return Infra::is_ascii_whitespace(c);
    };

    // Both the keyword search and the encoding lookup are case insensitive,
    // so the whole input is lowercased once up front.
    auto lowercase_string = string.to_lowercase();
    GenericLexer lexer(lowercase_string);

    // Keep looking for "charset" until one occurrence is followed by '='.
    bool found_equals_sign = false;
    while (!found_equals_sign) {
        auto keyword_offset = lexer.remaining().find(charset_keyword);
        if (!keyword_offset.has_value())
            return {};
        lexer.ignore(keyword_offset.value() + charset_keyword.length());
        lexer.ignore_while(is_whitespace);
        found_equals_sign = lexer.peek() == '=';
    }

    // Step over the '=' and any whitespace after it.
    lexer.ignore();
    lexer.ignore_while(is_whitespace);
    if (lexer.is_eof())
        return {};

    // Quoted value: everything up to the matching quote. An unmatched quote yields nothing.
    constexpr char quote_characters[] = { '"', '\'' };
    for (char quote : quote_characters) {
        if (!lexer.consume_specific(quote))
            continue;
        auto closing_quote = lexer.remaining().find(quote);
        if (!closing_quote.has_value())
            return {};
        auto quoted_encoding = lexer.remaining().substring_view(0, closing_quote.value());
        return TextCodec::get_standardized_encoding(quoted_encoding);
    }

    // Unquoted value: runs until whitespace, ';' or the end of the string.
    auto unquoted_encoding = lexer.consume_until([](char c) {
        return Infra::is_ascii_whitespace(c) || c == ';';
    });
    return TextCodec::get_standardized_encoding(unquoted_encoding);
}
// https://html.spec.whatwg.org/multipage/parsing.html#concept-get-attributes-when-sniffing
JS::GCPtr<DOM::Attr> prescan_get_attribute(DOM::Document& document, ByteBuffer const& input, size_t& position)
{
// 1. If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x2F (/) then advance position to the next byte and redo this step.
if (!prescan_skip_whitespace_and_slashes(input, position))
return {};
// 2. If the byte at position is 0x3E (>), then abort the get an attribute algorithm. There isn't one.
if (input[position] == '>')
return {};
// 3. Otherwise, the byte at position is the start of the attribute name. Let attribute name and attribute value be the empty string.
// 4. Process the byte at position as follows:
StringBuilder attribute_name;
while (true) {
// -> If it is 0x3D (=), and the attribute name is longer than the empty string
if (input[position] == '=' && !attribute_name.is_empty()) {
// Advance position to the next byte and jump to the step below labeled value.
++position;
goto value;
}
// -> If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP)
if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ') {
// Jump to the step below labeled spaces.
goto spaces;
}
// -> If it is 0x2F (/) or 0x3E (>)
if (input[position] == '/' || input[position] == '>') {
// Abort the get an attribute algorithm. The attribute's name is the value of attribute name, its value is the empty string.
return DOM::Attr::create(document, MUST(attribute_name.to_string()), String {});
}
// -> If it is in the range 0x41 (A) to 0x5A (Z)
if (input[position] >= 'A' && input[position] <= 'Z') {
// Append the code point b+0x20 to attribute name (where b is the value of the byte at position). (This converts the input to lowercase.)
attribute_name.append_code_point(input[position] + 0x20);
}
// -> Anything else
else {
// Append the code point with the same value as the byte at position to attribute name.
// (It doesn't actually matter how bytes outside the ASCII range are handled here,
// since only ASCII bytes can contribute to the detection of a character encoding.)
attribute_name.append_code_point(input[position]);
}
// 5. Advance position to the next byte and return to the previous step.
++position;
if (prescan_should_abort(input, position))
return {};
}
spaces:
// 6. Spaces: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP)
// then advance position to the next byte, then, repeat this step.
if (!prescan_skip_whitespace_and_slashes(input, position))
return {};
// 7. If the byte at position is not 0x3D (=), abort the get an attribute algorithm.
// The attribute's name is the value of attribute name, its value is the empty string.
if (input[position] != '=')
return DOM::Attr::create(document, MUST(attribute_name.to_string()), String {});
// 8. Advance position past the 0x3D (=) byte.
++position;
value:
// 9. Value: If the byte at position is one of 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), or 0x20 (SP)
// then advance position to the next byte, then, repeat this step.
if (!prescan_skip_whitespace_and_slashes(input, position))
return {};
StringBuilder attribute_value;
// 10. Process the byte at position as follows:
// -> If it is 0x22 (") or 0x27 (')
if (input[position] == '"' || input[position] == '\'') {
// 1. Let b be the value of the byte at position.
u8 quote_character = input[position];
// 2. Quote loop: Advance position to the next byte.
++position;
for (; !prescan_should_abort(input, position); ++position) {
// 3. If the value of the byte at position is the value of b, then advance position to the next byte
// and abort the "get an attribute" algorithm.
// The attribute's name is the value of attribute name, and its value is the value of attribute value.
if (input[position] == quote_character)
return DOM::Attr::create(document, MUST(attribute_name.to_string()), MUST(attribute_value.to_string()));
// 4. Otherwise, if the value of the byte at position is in the range 0x41 (A) to 0x5A (Z),
// then append a code point to attribute value whose value is 0x20 more than the value of the byte at position.
if (input[position] >= 'A' && input[position] <= 'Z') {
attribute_value.append_code_point(input[position] + 0x20);
}
// 5. Otherwise, append a code point to attribute value whose value is the same as the value of the byte at position.
else {
attribute_value.append_code_point(input[position]);
}
// 6. Return to the step above labeled quote loop.
}
return {};
}
// -> If it is 0x3E (>)
if (input[position] == '>') {
// Abort the get an attribute algorithm. The attribute's name is the value of attribute name, its value is the empty string.
return DOM::Attr::create(document, MUST(attribute_name.to_string()), String {});
}
// -> If it is in the range 0x41 (A) to 0x5A (Z)
if (input[position] >= 'A' && input[position] <= 'Z') {
// Append a code point b+0x20 to attribute value (where b is the value of the byte at position).
attribute_value.append_code_point(input[position] + 0x20);
// Advance position to the next byte.
++position;
}
// -> Anything else
else {
// Append a code point with the same value as the byte at position to attribute value.
attribute_value.append_code_point(input[position]);
// Advance position to the next byte.
++position;
}
if (prescan_should_abort(input, position))
return {};
// 11. Process the byte at position as follows:
for (; !prescan_should_abort(input, position); ++position) {
// -> If it is 0x09 (HT), 0x0A (LF), 0x0C (FF), 0x0D (CR), 0x20 (SP), or 0x3E (>)
if (input[position] == '\t' || input[position] == '\n' || input[position] == '\f' || input[position] == '\r' || input[position] == ' ' || input[position] == '>') {
// Abort the get an attribute algorithm. The attribute's name is the value of attribute name and its value is the value of attribute value.
return DOM::Attr::create(document, MUST(attribute_name.to_string()), MUST(attribute_value.to_string()));
}
// -> If it is in the range 0x41 (A) to 0x5A (Z)
if (input[position] >= 'A' && input[position] <= 'Z') {
// Append a code point b+0x20 to attribute value (where b is the value of the byte at position).
attribute_value.append_code_point(input[position] + 0x20);
}
// -> Anything else
else {
// Append a code point with the same value as the byte at position to attribute value.
attribute_value.append_code_point(input[position]);
}
// 12. Advance position to the next byte and return to the previous step.
}
return {};
}
// https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
// Scans the start of `input` (bounded by prescan_should_abort()) for an encoding declaration.
// Returns the detected encoding name, or an empty Optional when nothing usable was found.
// `document` is only used to create the temporary DOM::Attr objects produced by prescan_get_attribute().
Optional<ByteString> run_prescan_byte_stream_algorithm(DOM::Document& document, ByteBuffer const& input)
{
    // Detects '<?x' encoded as UTF-16, which requires at least six bytes of input.
    if (!prescan_should_abort(input, 5)) {
        // A sequence of bytes starting with: 0x3C, 0x0, 0x3F, 0x0, 0x78, 0x0
        if (input[0] == 0x3C && input[1] == 0x00 && input[2] == 0x3F && input[3] == 0x00 && input[4] == 0x78 && input[5] == 0x00)
            return "utf-16le";
        // A sequence of bytes starting with: 0x0, 0x3C, 0x0, 0x3F, 0x0, 0x78
        if (input[0] == 0x00 && input[1] == 0x3C && input[2] == 0x00 && input[3] == 0x3F && input[4] == 0x00 && input[5] == 0x78)
            return "utf-16be";
    }

    for (size_t position = 0; !prescan_should_abort(input, position); ++position) {
        if (!prescan_should_abort(input, position + 5) && input[position] == '<' && input[position + 1] == '!'
            && input[position + 2] == '-' && input[position + 3] == '-') {
            // A comment ("<!--"): move to the '>' that ends the first "-->" sequence.
            // Starting two bytes in lets the dashes of "<!--" count towards the terminator.
            position += 2;
            for (; !prescan_should_abort(input, position + 3); ++position) {
                if (input[position] == '-' && input[position + 1] == '-' && input[position + 2] == '>') {
                    position += 2;
                    break;
                }
            }
        } else if (!prescan_should_abort(input, position + 6)
            && input[position] == '<'
            && (input[position + 1] == 'M' || input[position + 1] == 'm')
            && (input[position + 2] == 'E' || input[position + 2] == 'e')
            && (input[position + 3] == 'T' || input[position + 3] == 't')
            && (input[position + 4] == 'A' || input[position + 4] == 'a')
            && prescan_is_whitespace_or_slash(input[position + 5])) {
            // A "<meta" tag followed by whitespace or a slash: inspect its attributes.
            position += 6;
            Vector<FlyString> attribute_list {};
            bool got_pragma = false;
            Optional<bool> need_pragma {};
            Optional<ByteString> charset {};

            while (true) {
                auto attribute = prescan_get_attribute(document, input, position);
                if (!attribute)
                    break;
                // Only the first occurrence of each attribute name is considered.
                if (attribute_list.contains_slow(attribute->name()))
                    continue;

                auto const& attribute_name = attribute->name();
                attribute_list.append(attribute->name());

                if (attribute_name == "http-equiv") {
                    got_pragma = attribute->value() == "content-type";
                } else if (attribute_name == "content") {
                    auto encoding = extract_character_encoding_from_meta_element(attribute->value().to_byte_string());
                    if (encoding.has_value() && !charset.has_value()) {
                        charset = encoding.value();
                        need_pragma = true;
                    }
                } else if (attribute_name == "charset") {
                    auto maybe_charset = TextCodec::get_standardized_encoding(attribute->value());
                    if (maybe_charset.has_value()) {
                        charset = Optional<ByteString> { maybe_charset };
                        need_pragma = { false };
                    }
                }
            }

            // Skip this meta element if it declared no encoding, or if a "content" declaration
            // was not accompanied by http-equiv="content-type".
            if (!need_pragma.has_value() || (need_pragma.value() && !got_pragma) || !charset.has_value())
                continue;

            // "If charset is UTF-16BE/LE, then set charset to UTF-8": the spec's "UTF-16BE/LE" means
            // either UTF-16BE or UTF-16LE, so both names have to be checked individually.
            // NOTE(review): assumes TextCodec::get_standardized_encoding() reports these encodings as
            // "UTF-16BE" / "UTF-16LE" (the spellings run_bom_sniff() uses) — confirm against LibTextCodec.
            if (charset.value() == "UTF-16BE" || charset.value() == "UTF-16LE")
                return "UTF-8";
            else if (charset.value() == "x-user-defined")
                return "windows-1252";
            else
                return charset.value();
        } else if (!prescan_should_abort(input, position + 3) && input[position] == '<'
            && ((input[position + 1] == '/' && isalpha(input[position + 2])) || isalpha(input[position + 1]))) {
            // Any other start or end tag: consume its attributes so their contents are not scanned as markup.
            position += 2;
            prescan_skip_whitespace_and_slashes(input, position);
            while (prescan_get_attribute(document, input, position)) { };
        } else if (!prescan_should_abort(input, position + 1) && input[position] == '<' && (input[position + 1] == '!' || input[position + 1] == '/' || input[position + 1] == '?')) {
            // "<!", "</" or "<?": move to the next '>' byte, giving up if the prescan has to abort first.
            position += 1;
            do {
                position += 1;
                if (prescan_should_abort(input, position))
                    return {};
            } while (input[position] != '>');
        } else {
            // Do nothing.
        }
    }
    return {};
}
// https://encoding.spec.whatwg.org/#bom-sniff
// Checks whether `input` starts with a byte order mark and returns the encoding it indicates,
// or an empty Optional when there is no BOM.
Optional<ByteString> run_bom_sniff(ByteBuffer const& input)
{
    // 1. Let BOM be the result of peeking 3 bytes from ioQueue, converted to a byte sequence.
    // 2. For each of the rows in the table below, starting with the first one and going down, if BOM starts with the bytes given in the first column, then return the encoding given in the cell in the second column of that row. Otherwise, return null.
    //    Byte order mark    Encoding
    //    0xEF 0xBB 0xBF     UTF-8
    //    0xFE 0xFF          UTF-16BE
    //    0xFF 0xFE          UTF-16LE

    // The UTF-8 BOM is three bytes long.
    if (input.size() >= 3 && input[0] == 0xEF && input[1] == 0xBB && input[2] == 0xBF)
        return "UTF-8";

    // The UTF-16 BOMs are only two bytes long, so they must also be recognised
    // when the input consists of nothing but the BOM.
    if (input.size() >= 2) {
        if (input[0] == 0xFE && input[1] == 0xFF)
            return "UTF-16BE";
        if (input[0] == 0xFF && input[1] == 0xFE)
            return "UTF-16LE";
    }

    return {};
}
// https://html.spec.whatwg.org/multipage/parsing.html#determining-the-character-encoding
// Picks the encoding to decode `input` with. Sources are consulted in this order:
// BOM, transport-layer MIME type, prescan of the byte stream, and finally a UTF-8 validity
// check that falls back to windows-1252.
ByteString run_encoding_sniffing_algorithm(DOM::Document& document, ByteBuffer const& input, Optional<MimeSniff::MimeType> maybe_mime_type)
{
    // 1. If the result of BOM sniffing is an encoding, return that encoding with confidence certain.
    // FIXME: There is no concept of decoding certainty yet.
    if (auto bom_encoding = run_bom_sniff(input); bom_encoding.has_value())
        return bom_encoding.value();

    // 2. FIXME: If the user has explicitly instructed the user agent to override the document's character encoding with a specific encoding,
    //    optionally return that encoding with the confidence certain.

    // 3. FIXME: The user agent may wait for more bytes of the resource to be available, either in this step or at any later step in this algorithm.
    //    For instance, a user agent might wait 500ms or 1024 bytes, whichever came first. In general preparsing the source to find the encoding improves performance,
    //    as it reduces the need to throw away the data structures used when parsing upon finding the encoding information. However, if the user agent delays too long
    //    to obtain data to determine the encoding, then the cost of the delay could outweigh any performance improvements from the preparse.

    // 4. If the transport layer specifies a character encoding, and it is supported, return that encoding with the confidence certain.
    if (maybe_mime_type.has_value()) {
        // FIXME: This is awkward because legacy_extract_an_encoding can not fail.
        //        A sentinel fallback value is passed in to detect "no encoding found".
        constexpr auto no_transport_encoding = "invalid"sv;
        auto transport_encoding = Fetch::Infrastructure::legacy_extract_an_encoding(maybe_mime_type, no_transport_encoding);
        if (transport_encoding != no_transport_encoding)
            return transport_encoding;
    }

    // 5. Optionally prescan the byte stream to determine its encoding, with the end condition being when the user agent decides that scanning further bytes would not
    //    be efficient. User agents are encouraged to only prescan the first 1024 bytes. User agents may decide that scanning any bytes is not efficient, in which case
    //    these substeps are entirely skipped.
    //    The aforementioned algorithm returns either a character encoding or failure. If it returns a character encoding, then return the same encoding, with confidence tentative.
    if (auto prescanned_encoding = run_prescan_byte_stream_algorithm(document, input); prescanned_encoding.has_value())
        return prescanned_encoding.value();

    // 6. FIXME: If the HTML parser for which this algorithm is being run is associated with a Document d whose container document is non-null, then:
    //    1. Let parentDocument be d's container document.
    //    2. If parentDocument's origin is same origin with d's origin and parentDocument's character encoding is not UTF-16BE/LE, then return parentDocument's character
    //       encoding, with the confidence tentative.

    // 7. Otherwise, if the user agent has information on the likely encoding for this page, e.g. based on the encoding of the page when it was last visited, then return
    //    that encoding, with the confidence tentative.

    // 8. FIXME: The user agent may attempt to autodetect the character encoding from applying frequency analysis or other algorithms to the data stream. Such algorithms
    //    may use information about the resource other than the resource's contents, including the address of the resource. If autodetection succeeds in determining a
    //    character encoding, and that encoding is a supported encoding, then return that encoding, with the confidence tentative. [UNIVCHARDET]
    bool const input_is_valid_utf8 = Utf8View(StringView(input)).validate();
    if (!input_is_valid_utf8) {
        // FIXME: As soon as Locale is supported, this should sometimes return a different encoding based on the locale.
        return "windows-1252";
    }

    // 9. Otherwise, return an implementation-defined or user-specified default character encoding, with the confidence tentative.
    //    In controlled environments or in environments where the encoding of documents can be prescribed (for example, for user agents intended for dedicated use in new
    //    networks), the comprehensive UTF-8 encoding is suggested.
    return "UTF-8";
}
}

View file

@ -0,0 +1,25 @@
/*
* Copyright (c) 2021, Max Wipfli <mail@maxwipfli.ch>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/ByteString.h>
#include <AK/Optional.h>
#include <LibJS/Heap/GCPtr.h>
#include <LibWeb/Forward.h>
namespace Web::HTML {
// Returns true when `position` is at or past the end of `input`, or at or past byte 1024.
bool prescan_should_abort(ByteBuffer const& input, size_t const& position);
// Returns true for the bytes HT, LF, FF, CR, SP and '/'.
bool prescan_is_whitespace_or_slash(u8 const& byte);
// Advances `position` over whitespace and '/' bytes; returns false if the prescan must abort afterwards.
bool prescan_skip_whitespace_and_slashes(ByteBuffer const& input, size_t& position);
// Extracts the encoding named after "charset=" in a meta element's content string, if any.
Optional<StringView> extract_character_encoding_from_meta_element(ByteString const&);
// Reads one attribute at `position` (advancing it); returns null when there is none or the prescan aborts.
JS::GCPtr<DOM::Attr> prescan_get_attribute(DOM::Document&, ByteBuffer const& input, size_t& position);
// Prescans the start of `input` for an encoding declaration; empty Optional when none is found.
Optional<ByteString> run_prescan_byte_stream_algorithm(DOM::Document&, ByteBuffer const& input);
// Returns the encoding indicated by a leading byte order mark, if `input` has one.
Optional<ByteString> run_bom_sniff(ByteBuffer const& input);
// Determines the encoding for `input` from the BOM, the optional MIME type, a prescan, and a final fallback.
ByteString run_encoding_sniffing_algorithm(DOM::Document&, ByteBuffer const& input, Optional<MimeSniff::MimeType> maybe_mime_type = {});
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,221 @@
/*
* Copyright (c) 2020-2022, Andreas Kling <andreas@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <LibGfx/Color.h>
#include <LibJS/Heap/Cell.h>
#include <LibWeb/DOM/Node.h>
#include <LibWeb/HTML/Parser/HTMLTokenizer.h>
#include <LibWeb/HTML/Parser/ListOfActiveFormattingElements.h>
#include <LibWeb/HTML/Parser/StackOfOpenElements.h>
#include <LibWeb/MimeSniff/MimeType.h>
namespace Web::HTML {
// X-macro listing every parser insertion mode.
// A user defines __ENUMERATE_INSERTION_MODE(mode) and then expands ENUMERATE_INSERTION_MODES;
// HTMLParser::InsertionMode below is generated this way.
#define ENUMERATE_INSERTION_MODES \
__ENUMERATE_INSERTION_MODE(Initial) \
__ENUMERATE_INSERTION_MODE(BeforeHTML) \
__ENUMERATE_INSERTION_MODE(BeforeHead) \
__ENUMERATE_INSERTION_MODE(InHead) \
__ENUMERATE_INSERTION_MODE(InHeadNoscript) \
__ENUMERATE_INSERTION_MODE(AfterHead) \
__ENUMERATE_INSERTION_MODE(InBody) \
__ENUMERATE_INSERTION_MODE(Text) \
__ENUMERATE_INSERTION_MODE(InTable) \
__ENUMERATE_INSERTION_MODE(InTableText) \
__ENUMERATE_INSERTION_MODE(InCaption) \
__ENUMERATE_INSERTION_MODE(InColumnGroup) \
__ENUMERATE_INSERTION_MODE(InTableBody) \
__ENUMERATE_INSERTION_MODE(InRow) \
__ENUMERATE_INSERTION_MODE(InCell) \
__ENUMERATE_INSERTION_MODE(InSelect) \
__ENUMERATE_INSERTION_MODE(InSelectInTable) \
__ENUMERATE_INSERTION_MODE(InTemplate) \
__ENUMERATE_INSERTION_MODE(AfterBody) \
__ENUMERATE_INSERTION_MODE(InFrameset) \
__ENUMERATE_INSERTION_MODE(AfterFrameset) \
__ENUMERATE_INSERTION_MODE(AfterAfterBody) \
__ENUMERATE_INSERTION_MODE(AfterAfterFrameset)
// HTML parser attached to a DOM::Document.
// The class owns an HTMLTokenizer and keeps the tree-construction state declared below:
// the current and original insertion mode, the stack of open elements, the list of active
// formatting elements, the stack of template insertion modes, and a set of parser flags.
class HTMLParser final : public JS::Cell {
JS_CELL(HTMLParser, JS::Cell);
JS_DECLARE_ALLOCATOR(HTMLParser);
// The tokenizer is a friend and so may access the private members of the parser.
friend class HTMLTokenizer;
public:
~HTMLParser();
// Factory functions. Parsers are handed out as GC pointers rather than constructed directly
// (the constructors are private).
// NOTE(review): create_with_uncertain_encoding presumably determines the encoding from `input`
// and the optional MIME type before parsing — confirm in the implementation.
static JS::NonnullGCPtr<HTMLParser> create_for_scripting(DOM::Document&);
static JS::NonnullGCPtr<HTMLParser> create_with_uncertain_encoding(DOM::Document&, ByteBuffer const& input, Optional<MimeSniff::MimeType> maybe_mime_type = {});
static JS::NonnullGCPtr<HTMLParser> create(DOM::Document&, StringView input, StringView encoding);
// Runs the parser. The tokenizer option selects whether to stop at the insertion point (default: No).
void run(HTMLTokenizer::StopAtInsertionPoint = HTMLTokenizer::StopAtInsertionPoint::No);
// Overload that additionally takes a URL.
void run(const URL::URL&, HTMLTokenizer::StopAtInsertionPoint = HTMLTokenizer::StopAtInsertionPoint::No);
// Static end-of-parsing routine for a document; the parser argument is optional.
static void the_end(JS::NonnullGCPtr<DOM::Document>, JS::GCPtr<HTMLParser> = nullptr);
DOM::Document& document();
// Option for parse_html_fragment().
enum class AllowDeclarativeShadowRoots {
No,
Yes,
};
// Parses a markup string in the context of `context_element` and returns the resulting nodes.
static Vector<JS::Handle<DOM::Node>> parse_html_fragment(DOM::Element& context_element, StringView, AllowDeclarativeShadowRoots = AllowDeclarativeShadowRoots::No);
// Option for serialize_html_fragment().
enum class SerializableShadowRoots {
No,
Yes,
};
// Serializes a node to markup. The serialization mode defaults to Inner.
static String serialize_html_fragment(DOM::Node const&, SerializableShadowRoots, Vector<JS::Handle<DOM::ShadowRoot>> const&, DOM::FragmentSerializationMode = DOM::FragmentSerializationMode::Inner);
// One enumerator per entry of ENUMERATE_INSERTION_MODES.
enum class InsertionMode {
#define __ENUMERATE_INSERTION_MODE(mode) mode,
ENUMERATE_INSERTION_MODES
#undef __ENUMERATE_INSERTION_MODE
};
InsertionMode insertion_mode() const { return m_insertion_mode; }
static bool is_special_tag(FlyString const& tag_name, Optional<FlyString> const& namespace_);
HTMLTokenizer& tokenizer() { return m_tokenizer; }
// https://html.spec.whatwg.org/multipage/parsing.html#abort-a-parser
void abort();
// Simple accessors for the parser flags and the script nesting level.
bool aborted() const { return m_aborted; }
bool stopped() const { return m_stop_parsing; }
size_t script_nesting_level() const { return m_script_nesting_level; }
private:
HTMLParser(DOM::Document&, StringView input, StringView encoding);
HTMLParser(DOM::Document&);
virtual void visit_edges(Cell::Visitor&) override;
char const* insertion_mode_name() const;
DOM::QuirksMode which_quirks_mode(HTMLToken const&) const;
// Token handlers named after insertion modes.
// NOTE(review): presumably each one implements the rules for the matching InsertionMode — confirm in the implementation.
void handle_initial(HTMLToken&);
void handle_before_html(HTMLToken&);
void handle_before_head(HTMLToken&);
void handle_in_head(HTMLToken&);
void handle_in_head_noscript(HTMLToken&);
void handle_after_head(HTMLToken&);
void handle_in_body(HTMLToken&);
void handle_after_body(HTMLToken&);
void handle_after_after_body(HTMLToken&);
void handle_text(HTMLToken&);
void handle_in_table(HTMLToken&);
void handle_in_table_body(HTMLToken&);
void handle_in_row(HTMLToken&);
void handle_in_cell(HTMLToken&);
void handle_in_table_text(HTMLToken&);
void handle_in_select_in_table(HTMLToken&);
void handle_in_select(HTMLToken&);
void handle_in_caption(HTMLToken&);
void handle_in_column_group(HTMLToken&);
void handle_in_template(HTMLToken&);
void handle_in_frameset(HTMLToken&);
void handle_after_frameset(HTMLToken&);
void handle_after_after_frameset(HTMLToken&);
// Sets the flag reported by stopped().
void stop_parsing() { m_stop_parsing = true; }
void generate_implied_end_tags(FlyString const& exception = {});
void generate_all_implied_end_tags_thoroughly();
JS::NonnullGCPtr<DOM::Element> create_element_for(HTMLToken const&, Optional<FlyString> const& namespace_, DOM::Node& intended_parent);
// Where a node should be inserted: the parent, and optionally the sibling to insert before.
struct AdjustedInsertionLocation {
JS::GCPtr<DOM::Node> parent;
JS::GCPtr<DOM::Node> insert_before_sibling;
};
AdjustedInsertionLocation find_appropriate_place_for_inserting_node(JS::GCPtr<DOM::Element> override_target = nullptr);
void insert_an_element_at_the_adjusted_insertion_location(JS::NonnullGCPtr<DOM::Element>);
DOM::Text* find_character_insertion_node();
void flush_character_insertions();
// Option for insert_foreign_element().
enum class OnlyAddToElementStack {
No,
Yes,
};
JS::NonnullGCPtr<DOM::Element> insert_foreign_element(HTMLToken const&, Optional<FlyString> const& namespace_, OnlyAddToElementStack);
JS::NonnullGCPtr<DOM::Element> insert_html_element(HTMLToken const&);
[[nodiscard]] JS::GCPtr<DOM::Element> current_node();
[[nodiscard]] JS::GCPtr<DOM::Element> adjusted_current_node();
[[nodiscard]] JS::GCPtr<DOM::Element> node_before_current_node();
void insert_character(u32 data);
void insert_comment(HTMLToken&);
void reconstruct_the_active_formatting_elements();
void close_a_p_element();
void process_using_the_rules_for(InsertionMode, HTMLToken&);
void process_using_the_rules_for_foreign_content(HTMLToken&);
void parse_generic_raw_text_element(HTMLToken&);
void increment_script_nesting_level();
void decrement_script_nesting_level();
void reset_the_insertion_mode_appropriately();
void adjust_mathml_attributes(HTMLToken&);
void adjust_svg_tag_names(HTMLToken&);
void adjust_svg_attributes(HTMLToken&);
static void adjust_foreign_attributes(HTMLToken&);
// Result of run_the_adoption_agency_algorithm().
enum AdoptionAgencyAlgorithmOutcome {
DoNothing,
RunAnyOtherEndTagSteps,
};
AdoptionAgencyAlgorithmOutcome run_the_adoption_agency_algorithm(HTMLToken&);
void clear_the_stack_back_to_a_table_context();
void clear_the_stack_back_to_a_table_body_context();
void clear_the_stack_back_to_a_table_row_context();
void close_the_cell();
// Tree-construction state. Both insertion modes start out as Initial.
InsertionMode m_insertion_mode { InsertionMode::Initial };
InsertionMode m_original_insertion_mode { InsertionMode::Initial };
StackOfOpenElements m_stack_of_open_elements;
Vector<InsertionMode> m_stack_of_template_insertion_modes;
ListOfActiveFormattingElements m_list_of_active_formatting_elements;
HTMLTokenizer m_tokenizer;
// Parser flags with their initial values.
bool m_foster_parenting { false };
bool m_frameset_ok { true };
bool m_parsing_fragment { false };
// https://html.spec.whatwg.org/multipage/parsing.html#scripting-flag
// The scripting flag is set to "enabled" if scripting was enabled for the Document with which the parser is associated when the parser was created, and "disabled" otherwise.
bool m_scripting_enabled { true };
bool m_invoked_via_document_write { false };
bool m_aborted { false };
bool m_parser_pause_flag { false };
bool m_stop_parsing { false };
size_t m_script_nesting_level { 0 };
JS::Realm& realm();
// GC-managed objects the parser refers to.
JS::GCPtr<DOM::Document> m_document;
JS::GCPtr<HTMLHeadElement> m_head_element;
JS::GCPtr<HTMLFormElement> m_form_element;
JS::GCPtr<DOM::Element> m_context_element;
Vector<HTMLToken> m_pending_table_character_tokens;
// Text node and builder used for character insertion (see find_character_insertion_node()
// and flush_character_insertions()).
JS::GCPtr<DOM::Text> m_character_insertion_node;
StringBuilder m_character_insertion_builder;
};
// Free parsing helpers for legacy attribute values; each takes the attribute text as a StringView.
// NOTE(review): the exact grammar and the failure results (presumably a null RefPtr / empty Optional)
// are defined in the implementation, not in this header — confirm there.
RefPtr<CSS::CSSStyleValue> parse_dimension_value(StringView);
RefPtr<CSS::CSSStyleValue> parse_nonzero_dimension_value(StringView);
Optional<Color> parse_legacy_color_value(StringView);
}

View file

@ -0,0 +1,105 @@
/*
* Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <AK/HashTable.h>
#include <LibWeb/HTML/Parser/HTMLToken.h>
namespace Web::HTML {
// Builds a human-readable description of the token:
// the type name, type-specific details (doctype name, tag name and attributes,
// comment text or code point), followed by the source position(s) after an '@'.
// An Invalid token hits VERIFY_NOT_REACHED().
String HTMLToken::to_string() const
{
    auto const token_type = type();

    // Name of the token type.
    StringView type_label;
    switch (token_type) {
    case HTMLToken::Type::DOCTYPE:
        type_label = "DOCTYPE"sv;
        break;
    case HTMLToken::Type::StartTag:
        type_label = "StartTag"sv;
        break;
    case HTMLToken::Type::EndTag:
        type_label = "EndTag"sv;
        break;
    case HTMLToken::Type::Comment:
        type_label = "Comment"sv;
        break;
    case HTMLToken::Type::Character:
        type_label = "Character"sv;
        break;
    case HTMLToken::Type::EndOfFile:
        type_label = "EndOfFile"sv;
        break;
    case HTMLToken::Type::Invalid:
        VERIFY_NOT_REACHED();
    }

    StringBuilder builder;
    builder.append(type_label);

    // Type-specific details.
    if (token_type == HTMLToken::Type::DOCTYPE) {
        builder.append(" { name: '"sv);
        builder.append(doctype_data().name);
        builder.append("' }"sv);
    }

    bool const is_tag = token_type == HTMLToken::Type::StartTag || token_type == HTMLToken::Type::EndTag;
    if (is_tag) {
        builder.append(" { name: '"sv);
        builder.append(tag_name());
        builder.append("', { "sv);
        for_each_attribute([&](auto& attribute) {
            builder.append(attribute.local_name);
            builder.append("=\""sv);
            builder.append(attribute.value);
            builder.append("\" "sv);
            return IterationDecision::Continue;
        });
        builder.append("} }"sv);
    }

    if (is_comment()) {
        builder.append(" { data: '"sv);
        builder.append(comment());
        builder.append("' }"sv);
    }

    if (is_character()) {
        builder.append(" { data: '"sv);
        builder.append_code_point(code_point());
        builder.append("' }"sv);
    }

    // Character tokens only report their start position; all others report a start-end range.
    if (token_type == HTMLToken::Type::Character)
        builder.appendff("@{}:{}", m_start_position.line, m_start_position.column);
    else
        builder.appendff("@{}:{}-{}:{}", m_start_position.line, m_start_position.column, m_end_position.line, m_end_position.column);

    return MUST(builder.to_string());
}
void HTMLToken::normalize_attributes()
{
// From AttributeNameState: https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
//
// When the user agent leaves the attribute name state (and before emitting the tag token, if appropriate),
// the complete attribute's name must be compared to the other attributes on the same token;
// if there is already an attribute on the token with the exact same name, then this is a duplicate-attribute
// parse error and the new attribute must be removed from the token.
// NOTE: If an attribute is so removed from a token, it, and the value that gets associated with it, if any,
// are never subsequently used by the parser, and are therefore effectively discarded. Removing the attribute
// in this way does not change its status as the "current attribute" for the purposes of the tokenizer, however.
HashTable<FlyString> seen_attributes;
auto* ptr = tag_attributes();
if (!ptr)
return;
auto& tag_attributes = *ptr;
for (size_t i = 0; i < tag_attributes.size(); ++i) {
auto& attribute = tag_attributes[i];
if (seen_attributes.set(attribute.local_name, AK::HashSetExistingEntryBehavior::Keep) == AK::HashSetResult::KeptExistingEntry) {
// This is a duplicate attribute, remove it.
tag_attributes.remove(i);
--i;
}
}
}
}

View file

@ -0,0 +1,371 @@
/*
* Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
* Copyright (c) 2021, Max Wipfli <max.wipfli@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/FlyString.h>
#include <AK/Function.h>
#include <AK/OwnPtr.h>
#include <AK/Types.h>
#include <AK/Variant.h>
#include <AK/Vector.h>
namespace Web::HTML {
class HTMLTokenizer;
class HTMLToken {
AK_MAKE_NONCOPYABLE(HTMLToken);
AK_MAKE_DEFAULT_MOVABLE(HTMLToken);
public:
// Kind of token. Stored in a single byte; Invalid is the first enumerator.
enum class Type : u8 {
Invalid,
DOCTYPE,
StartTag,
EndTag,
Comment,
Character,
EndOfFile,
};
// A location in the input, expressed as line, column and byte offset. All fields default to 0.
struct Position {
size_t line { 0 };
size_t column { 0 };
size_t byte_offset { 0 };
};
// One attribute of a tag token: its (optionally prefixed and namespaced) name, its value,
// and the positions where the name and the value start and end.
struct Attribute {
Optional<FlyString> prefix;
FlyString local_name;
Optional<FlyString> namespace_;
String value;
Position name_start_position;
Position value_start_position;
Position name_end_position;
Position value_end_position;
};
// Payload of a DOCTYPE token. Each string has a companion "missing" flag (initially true),
// and force_quirks starts out false.
struct DoctypeData {
// NOTE: "Missing" is a distinct state from the empty string.
String name;
String public_identifier;
String system_identifier;
bool missing_name { true };
bool missing_public_identifier { true };
bool missing_system_identifier { true };
bool force_quirks { false };
};
// Creates a Character token carrying the given code point.
static HTMLToken make_character(u32 code_point)
{
HTMLToken token { Type::Character };
token.set_code_point(code_point);
return token;
}
// Creates a StartTag token with the given tag name.
static HTMLToken make_start_tag(FlyString const& tag_name)
{
HTMLToken token { Type::StartTag };
token.set_tag_name(tag_name);
return token;
}
HTMLToken() = default;

// Creates a token of the given type and prepares the payload slot that type uses:
// a zero code point for Character tokens, an empty DoctypeData pointer for DOCTYPE tokens,
// and an empty attribute-list pointer for start and end tags. Other types leave the payload untouched.
HTMLToken(Type type)
    : m_type(type)
{
    if (m_type == Type::Character) {
        m_data.set(0u);
    } else if (m_type == Type::DOCTYPE) {
        m_data.set(OwnPtr<DoctypeData> {});
    } else if (m_type == Type::StartTag || m_type == Type::EndTag) {
        m_data.set(OwnPtr<Vector<Attribute>>());
    }
}
bool is_doctype() const { return m_type == Type::DOCTYPE; }
bool is_start_tag() const { return m_type == Type::StartTag; }
bool is_end_tag() const { return m_type == Type::EndTag; }
bool is_comment() const { return m_type == Type::Comment; }
bool is_character() const { return m_type == Type::Character; }
bool is_end_of_file() const { return m_type == Type::EndOfFile; }
u32 code_point() const
{
VERIFY(is_character());
return m_data.get<u32>();
}
bool is_parser_whitespace() const
{
// NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
if (!is_character())
return false;
switch (code_point()) {
case '\t':
case '\n':
case '\f':
case '\r':
case ' ':
return true;
default:
return false;
}
}
void set_code_point(u32 code_point)
{
VERIFY(is_character());
m_data.get<u32>() = code_point;
}
String const& comment() const
{
VERIFY(is_comment());
return m_comment_data;
}
void set_comment(String comment)
{
VERIFY(is_comment());
m_comment_data = move(comment);
}
FlyString const& tag_name() const
{
VERIFY(is_start_tag() || is_end_tag());
return m_string_data;
}
void set_tag_name(FlyString name)
{
VERIFY(is_start_tag() || is_end_tag());
m_string_data = move(name);
}
bool is_self_closing() const
{
VERIFY(is_start_tag() || is_end_tag());
return m_tag_self_closing;
}
void set_self_closing(bool self_closing)
{
VERIFY(is_start_tag() || is_end_tag());
m_tag_self_closing = self_closing;
}
bool has_acknowledged_self_closing_flag() const
{
VERIFY(is_self_closing());
return m_tag_self_closing_acknowledged;
}
void acknowledge_self_closing_flag_if_set()
{
if (is_self_closing())
m_tag_self_closing_acknowledged = true;
}
bool has_attributes() const
{
VERIFY(is_start_tag() || is_end_tag());
auto* ptr = tag_attributes();
return ptr && !ptr->is_empty();
}
size_t attribute_count() const
{
VERIFY(is_start_tag() || is_end_tag());
if (auto* ptr = tag_attributes())
return ptr->size();
return 0;
}
void add_attribute(Attribute attribute)
{
VERIFY(is_start_tag() || is_end_tag());
ensure_tag_attributes().append(move(attribute));
}
Attribute const& last_attribute() const
{
VERIFY(is_start_tag() || is_end_tag());
VERIFY(has_attributes());
return tag_attributes()->last();
}
Attribute& last_attribute()
{
VERIFY(is_start_tag() || is_end_tag());
VERIFY(has_attributes());
return tag_attributes()->last();
}
void drop_attributes()
{
VERIFY(is_start_tag() || is_end_tag());
m_data.get<OwnPtr<Vector<Attribute>>>().clear();
}
void for_each_attribute(Function<IterationDecision(Attribute const&)> callback) const
{
VERIFY(is_start_tag() || is_end_tag());
auto* ptr = tag_attributes();
if (!ptr)
return;
for (auto& attribute : *ptr) {
if (callback(attribute) == IterationDecision::Break)
break;
}
}
void for_each_attribute(Function<IterationDecision(Attribute&)> callback)
{
VERIFY(is_start_tag() || is_end_tag());
auto* ptr = tag_attributes();
if (!ptr)
return;
for (auto& attribute : *ptr) {
if (callback(attribute) == IterationDecision::Break)
break;
}
}
Optional<String> attribute(FlyString const& attribute_name) const
{
if (auto result = raw_attribute(attribute_name); result.has_value())
return result->value;
return {};
}
Optional<Attribute const&> raw_attribute(FlyString const& attribute_name) const
{
VERIFY(is_start_tag() || is_end_tag());
auto* ptr = tag_attributes();
if (!ptr)
return {};
for (auto const& attribute : *ptr) {
if (attribute_name == attribute.local_name)
return attribute;
}
return {};
}
bool has_attribute(FlyString const& attribute_name) const
{
return attribute(attribute_name).has_value();
}
void adjust_tag_name(FlyString const& old_name, FlyString const& new_name)
{
VERIFY(is_start_tag() || is_end_tag());
if (old_name == tag_name())
set_tag_name(new_name);
}
void adjust_attribute_name(FlyString const& old_name, FlyString const& new_name)
{
VERIFY(is_start_tag() || is_end_tag());
for_each_attribute([&](Attribute& attribute) {
if (old_name == attribute.local_name)
attribute.local_name = new_name;
return IterationDecision::Continue;
});
}
void adjust_foreign_attribute(FlyString const& old_name, Optional<FlyString> const& prefix, FlyString const& local_name, Optional<FlyString> const& namespace_)
{
VERIFY(is_start_tag() || is_end_tag());
for_each_attribute([&](Attribute& attribute) {
if (old_name == attribute.local_name) {
attribute.prefix = prefix;
attribute.local_name = local_name;
attribute.namespace_ = namespace_;
}
return IterationDecision::Continue;
});
}
DoctypeData const& doctype_data() const
{
VERIFY(is_doctype());
auto* ptr = m_data.get<OwnPtr<DoctypeData>>().ptr();
VERIFY(ptr);
return *ptr;
}
DoctypeData& ensure_doctype_data()
{
VERIFY(is_doctype());
auto& ptr = m_data.get<OwnPtr<DoctypeData>>();
if (!ptr)
ptr = make<DoctypeData>();
return *ptr;
}
Type type() const { return m_type; }
String to_string() const;
Position const& start_position() const { return m_start_position; }
Position const& end_position() const { return m_end_position; }
void set_start_position(Badge<HTMLTokenizer>, Position start_position) { m_start_position = start_position; }
void set_end_position(Badge<HTMLTokenizer>, Position end_position) { m_end_position = end_position; }
void normalize_attributes();
private:
Vector<Attribute> const* tag_attributes() const
{
return m_data.get<OwnPtr<Vector<Attribute>>>().ptr();
}
Vector<Attribute>* tag_attributes()
{
return m_data.get<OwnPtr<Vector<Attribute>>>().ptr();
}
Vector<Attribute>& ensure_tag_attributes()
{
VERIFY(is_start_tag() || is_end_tag());
auto& ptr = m_data.get<OwnPtr<Vector<Attribute>>>();
if (!ptr)
ptr = make<Vector<Attribute>>();
return *ptr;
}
Type m_type { Type::Invalid };
// Type::StartTag and Type::EndTag
bool m_tag_self_closing { false };
bool m_tag_self_closing_acknowledged { false };
// Type::StartTag and Type::EndTag (tag name)
FlyString m_string_data;
// Type::Comment (comment data)
String m_comment_data;
Variant<Empty, u32, OwnPtr<DoctypeData>, OwnPtr<Vector<Attribute>>> m_data {};
Position m_start_position;
Position m_end_position;
};
}

View file

@ -0,0 +1,211 @@
/*
* Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
@_exported import WebCxx
// An HTML token: a TokenType payload plus the start and end positions of the token.
// The tag-only accessors (attributes, tagName, selfClosing) trap when the token is not a
// StartTag or EndTag.
public class HTMLToken {
    // A source location; all fields default to zero.
    public struct Position: Equatable {
        var line = UInt()
        var column = UInt()
        var byteOffset = UInt()
    }

    // One attribute of a tag token together with the positions of its name and value.
    public struct Attribute: Equatable {
        public var prefix: Swift.String? = nil
        public var localName: Swift.String
        public var namespace_: Swift.String? = nil
        public var value: Swift.String

        public var nameStartPosition = Position()
        public var nameEndPosition = Position()

        public var valueStartPosition = Position()
        public var valueEndPosition = Position()

        public init(localName: Swift.String, value: Swift.String) {
            self.localName = localName
            self.value = value
        }
    }

    // The kind of token together with its kind-specific payload.
    public enum TokenType: Equatable {
        case Invalid
        case DOCTYPE(
            name: Swift.String?,
            publicIdentifier: Swift.String?,
            systemIdentifier: Swift.String?,
            forceQuirksMode: Bool)
        case StartTag(
            tagName: Swift.String,
            selfClosing: Bool = false,
            selfClosingAcknowledged: Bool = false,
            attributes: [Attribute] = [])
        case EndTag(
            tagName: Swift.String,
            selfClosing: Bool = false,
            selfClosingAcknowledged: Bool = false,
            attributes: [Attribute] = [])
        case Comment(data: Swift.String)
        case Character(codePoint: Character)
        case EndOfFile
    }

    public func isCharacter() -> Bool {
        guard case .Character = self.type else { return false }
        return true
    }

    public func isEndTag() -> Bool {
        guard case .EndTag = self.type else { return false }
        return true
    }

    public func isStartTag() -> Bool {
        guard case .StartTag = self.type else { return false }
        return true
    }

    public func isTag() -> Bool {
        switch self.type {
        case .StartTag, .EndTag:
            return true
        default:
            return false
        }
    }

    // Must only be called on a Character token; traps otherwise.
    public func isParserWhitespace() -> Bool {
        precondition(isCharacter(), "isParserWhitespace() called on non-character token")

        guard case .Character(let codePoint) = self.type else { return false }

        // NOTE: The parser considers '\r' to be whitespace, while the tokenizer does not.
        switch codePoint {
        case "\t", "\n", "\u{000C}" /* \f */, "\r", " ":
            return true
        default:
            return false
        }
    }

    public var type = TokenType.Invalid
    public var startPosition = Position()
    public var endPosition = Position()

    // Is in-place mutating enums a thing? Seems not https://forums.swift.org/t/in-place-mutation-of-an-enum-associated-value/11747
    // Each setter below therefore rebuilds the whole enum value with one field replaced.
    public var attributes: [Attribute] {
        get {
            switch self.type {
            case .StartTag(_, _, _, let attributes), .EndTag(_, _, _, let attributes):
                return attributes
            default:
                preconditionFailure("attributes called on non-tag token")
            }
        }
        set {
            switch self.type {
            case let .StartTag(tagName, selfClosing, selfClosingAcknowledged, _):
                self.type = .StartTag(tagName: tagName, selfClosing: selfClosing, selfClosingAcknowledged: selfClosingAcknowledged, attributes: newValue)
            case let .EndTag(tagName, selfClosing, selfClosingAcknowledged, _):
                self.type = .EndTag(tagName: tagName, selfClosing: selfClosing, selfClosingAcknowledged: selfClosingAcknowledged, attributes: newValue)
            default:
                preconditionFailure("attributes= called on non-tag token")
            }
        }
    }

    public var tagName: Swift.String {
        get {
            switch self.type {
            case .StartTag(let tagName, _, _, _), .EndTag(let tagName, _, _, _):
                return tagName
            default:
                preconditionFailure("tagName called on non-tag token")
            }
        }
        set {
            switch self.type {
            case let .StartTag(_, selfClosing, selfClosingAcknowledged, attributes):
                self.type = .StartTag(tagName: newValue, selfClosing: selfClosing, selfClosingAcknowledged: selfClosingAcknowledged, attributes: attributes)
            case let .EndTag(_, selfClosing, selfClosingAcknowledged, attributes):
                self.type = .EndTag(tagName: newValue, selfClosing: selfClosing, selfClosingAcknowledged: selfClosingAcknowledged, attributes: attributes)
            default:
                preconditionFailure("tagName= called on non-tag token")
            }
        }
    }

    public var selfClosing: Bool {
        get {
            switch self.type {
            case .StartTag(_, let selfClosing, _, _), .EndTag(_, let selfClosing, _, _):
                return selfClosing
            default:
                preconditionFailure("selfClosing called on non-tag token")
            }
        }
        set {
            switch self.type {
            case let .StartTag(tagName, _, selfClosingAcknowledged, attributes):
                self.type = .StartTag(tagName: tagName, selfClosing: newValue, selfClosingAcknowledged: selfClosingAcknowledged, attributes: attributes)
            case let .EndTag(tagName, _, selfClosingAcknowledged, attributes):
                self.type = .EndTag(tagName: tagName, selfClosing: newValue, selfClosingAcknowledged: selfClosingAcknowledged, attributes: attributes)
            default:
                preconditionFailure("selfClosing= called on non-tag token")
            }
        }
    }

    public init() {}

    public init(type: TokenType) {
        self.type = type
    }
}
extension HTMLToken.Position: CustomStringConvertible {
    // Formats the position as "line:column"; the byte offset is not part of the text.
    public var description: Swift.String {
        "\(self.line):\(self.column)"
    }
}
extension HTMLToken.TokenType: CustomStringConvertible {
    // Human-readable rendering of the token type and its payload.
    // Absent optional DOCTYPE fields are printed as "nil".
    // FIXME: Print attributes for start/end tags
    public var description: Swift.String {
        switch self {
        case .Invalid:
            return "Invalid"
        case let .DOCTYPE(name, publicIdentifier, systemIdentifier, forceQuirksMode):
            return "DOCTYPE(name: \(name ?? "nil"), publicIdentifier: \(publicIdentifier ?? "nil"), systemIdentifier: \(systemIdentifier ?? "nil"), forceQuirksMode: \(forceQuirksMode))"
        case let .StartTag(tagName, selfClosing, selfClosingAcknowledged, attributes):
            return "StartTag(tagName: \(tagName), selfClosing: \(selfClosing), selfClosingAcknowledged: \(selfClosingAcknowledged), attributes: \(attributes))"
        case let .EndTag(tagName, selfClosing, selfClosingAcknowledged, attributes):
            return "EndTag(tagName: \(tagName), selfClosing: \(selfClosing), selfClosingAcknowledged: \(selfClosingAcknowledged), attributes: \(attributes))"
        case let .Comment(data):
            return "Comment(data: \(data))"
        case let .Character(codePoint):
            return "Character(codePoint: \(codePoint))"
        case .EndOfFile:
            return "EndOfFile"
        }
    }
}
extension HTMLToken: CustomStringConvertible {
    // Prints the token type, followed by "@start" or "@start-end" when those positions
    // differ from a default-constructed Position.
    public var description: Swift.String {
        let unsetPosition = Position()

        guard self.startPosition != unsetPosition else {
            return "HTMLToken(type: \(self.type))"
        }
        guard self.endPosition != unsetPosition else {
            return "HTMLToken(type: \(self.type))@\(self.startPosition)"
        }
        return "HTMLToken(type: \(self.type))@\(self.startPosition)-\(self.endPosition)"
    }
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,223 @@
/*
* Copyright (c) 2020, Andreas Kling <andreas@ladybird.org>
* Copyright (c) 2022, Linus Groh <linusg@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/Queue.h>
#include <AK/StringBuilder.h>
#include <AK/StringView.h>
#include <AK/Types.h>
#include <AK/Utf8View.h>
#include <LibJS/Heap/GCPtr.h>
#include <LibWeb/Forward.h>
#include <LibWeb/HTML/Parser/HTMLToken.h>
namespace Web::HTML {
// X-macro listing every tokenizer state, one __ENUMERATE_TOKENIZER_STATE(name) entry per state.
// To use it, define __ENUMERATE_TOKENIZER_STATE(state) to the desired expansion, expand
// ENUMERATE_TOKENIZER_STATES, then #undef __ENUMERATE_TOKENIZER_STATE again.
// HTMLTokenizer expands it to produce the State enumerators and the state_name() switch cases.
#define ENUMERATE_TOKENIZER_STATES \
__ENUMERATE_TOKENIZER_STATE(Data) \
__ENUMERATE_TOKENIZER_STATE(RCDATA) \
__ENUMERATE_TOKENIZER_STATE(RAWTEXT) \
__ENUMERATE_TOKENIZER_STATE(ScriptData) \
__ENUMERATE_TOKENIZER_STATE(PLAINTEXT) \
__ENUMERATE_TOKENIZER_STATE(TagOpen) \
__ENUMERATE_TOKENIZER_STATE(EndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(TagName) \
__ENUMERATE_TOKENIZER_STATE(RCDATALessThanSign) \
__ENUMERATE_TOKENIZER_STATE(RCDATAEndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(RCDATAEndTagName) \
__ENUMERATE_TOKENIZER_STATE(RAWTEXTLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(RAWTEXTEndTagName) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEndTagName) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStart) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapeStartDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscaped) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedDashDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagOpen) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataEscapedEndTagName) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeStart) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscaped) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedDashDash) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapedLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(ScriptDataDoubleEscapeEnd) \
__ENUMERATE_TOKENIZER_STATE(BeforeAttributeName) \
__ENUMERATE_TOKENIZER_STATE(AttributeName) \
__ENUMERATE_TOKENIZER_STATE(AfterAttributeName) \
__ENUMERATE_TOKENIZER_STATE(BeforeAttributeValue) \
__ENUMERATE_TOKENIZER_STATE(AttributeValueDoubleQuoted) \
__ENUMERATE_TOKENIZER_STATE(AttributeValueSingleQuoted) \
__ENUMERATE_TOKENIZER_STATE(AttributeValueUnquoted) \
__ENUMERATE_TOKENIZER_STATE(AfterAttributeValueQuoted) \
__ENUMERATE_TOKENIZER_STATE(SelfClosingStartTag) \
__ENUMERATE_TOKENIZER_STATE(BogusComment) \
__ENUMERATE_TOKENIZER_STATE(MarkupDeclarationOpen) \
__ENUMERATE_TOKENIZER_STATE(CommentStart) \
__ENUMERATE_TOKENIZER_STATE(CommentStartDash) \
__ENUMERATE_TOKENIZER_STATE(Comment) \
__ENUMERATE_TOKENIZER_STATE(CommentLessThanSign) \
__ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBang) \
__ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDash) \
__ENUMERATE_TOKENIZER_STATE(CommentLessThanSignBangDashDash) \
__ENUMERATE_TOKENIZER_STATE(CommentEndDash) \
__ENUMERATE_TOKENIZER_STATE(CommentEnd) \
__ENUMERATE_TOKENIZER_STATE(CommentEndBang) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPE) \
__ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEName) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPEName) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEName) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicKeyword) \
__ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPEPublicIdentifier) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierDoubleQuoted) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPEPublicIdentifierSingleQuoted) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPEPublicIdentifier) \
__ENUMERATE_TOKENIZER_STATE(BetweenDOCTYPEPublicAndSystemIdentifiers) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemKeyword) \
__ENUMERATE_TOKENIZER_STATE(BeforeDOCTYPESystemIdentifier) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierDoubleQuoted) \
__ENUMERATE_TOKENIZER_STATE(DOCTYPESystemIdentifierSingleQuoted) \
__ENUMERATE_TOKENIZER_STATE(AfterDOCTYPESystemIdentifier) \
__ENUMERATE_TOKENIZER_STATE(BogusDOCTYPE) \
__ENUMERATE_TOKENIZER_STATE(CDATASection) \
__ENUMERATE_TOKENIZER_STATE(CDATASectionBracket) \
__ENUMERATE_TOKENIZER_STATE(CDATASectionEnd) \
__ENUMERATE_TOKENIZER_STATE(CharacterReference) \
__ENUMERATE_TOKENIZER_STATE(NamedCharacterReference) \
__ENUMERATE_TOKENIZER_STATE(AmbiguousAmpersand) \
__ENUMERATE_TOKENIZER_STATE(NumericCharacterReference) \
__ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReferenceStart) \
__ENUMERATE_TOKENIZER_STATE(DecimalCharacterReferenceStart) \
__ENUMERATE_TOKENIZER_STATE(HexadecimalCharacterReference) \
__ENUMERATE_TOKENIZER_STATE(DecimalCharacterReference) \
__ENUMERATE_TOKENIZER_STATE(NumericCharacterReferenceEnd)
// Turns decoded input into a stream of HTMLToken objects, one per next_token() call.
// Besides the current state it tracks an optional "insertion point" (a position in the
// input that can be saved, restored and cleared) and flags for blocking and aborting.
class HTMLTokenizer {
public:
    explicit HTMLTokenizer();
    explicit HTMLTokenizer(StringView input, ByteString const& encoding);

    // One enumerator per entry of ENUMERATE_TOKENIZER_STATES.
    enum class State {
#define __ENUMERATE_TOKENIZER_STATE(state) state,
        ENUMERATE_TOKENIZER_STATES
#undef __ENUMERATE_TOKENIZER_STATE
    };

    enum class StopAtInsertionPoint {
        No,
        Yes,
    };

    Optional<HTMLToken> next_token(StopAtInsertionPoint = StopAtInsertionPoint::No);

    // The Badge parameter restricts this call to HTMLParser.
    void set_parser(Badge<HTMLParser>, HTMLParser& parser) { m_parser = &parser; }

    void switch_to(Badge<HTMLParser>, State new_state);

    // Sets the current state directly.
    void switch_to(State new_state) { m_state = new_state; }

    void set_blocked(bool b) { m_blocked = b; }
    bool is_blocked() const { return m_blocked; }

    ByteString source() const { return m_decoded_input; }

    void insert_input_at_insertion_point(StringView input);
    void insert_eof();
    bool is_eof_inserted();

    bool is_insertion_point_defined() const { return m_insertion_point.defined; }

    // True when an insertion point is defined and the read iterator's offset is at or past it.
    bool is_insertion_point_reached()
    {
        if (!m_insertion_point.defined)
            return false;
        return m_utf8_view.iterator_offset(m_utf8_iterator) >= m_insertion_point.position;
    }

    void undefine_insertion_point() { m_insertion_point.defined = false; }

    // Saves / restores the insertion point into / from a single backup slot.
    void store_insertion_point() { m_old_insertion_point = m_insertion_point; }
    void restore_insertion_point() { m_insertion_point = m_old_insertion_point; }

    // Defines the insertion point at the read iterator's current offset.
    void update_insertion_point()
    {
        m_insertion_point.position = m_utf8_view.iterator_offset(m_utf8_iterator);
        m_insertion_point.defined = true;
    }

    // This permanently cuts off the tokenizer input stream.
    void abort() { m_aborted = true; }

private:
    void skip(size_t count);
    Optional<u32> next_code_point();
    Optional<u32> peek_code_point(size_t offset) const;
    bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive);
    void create_new_token(HTMLToken::Type);
    bool current_end_tag_token_is_appropriate() const;
    String consume_current_builder();

    // Returns the enumerator's name as a C string.
    static char const* state_name(State state)
    {
        switch (state) {
#define __ENUMERATE_TOKENIZER_STATE(state) \
    case State::state:                     \
        return #state;
            ENUMERATE_TOKENIZER_STATES
#undef __ENUMERATE_TOKENIZER_STATE
        };
        VERIFY_NOT_REACHED();
    }

    void will_emit(HTMLToken&);
    void will_switch_to(State);
    void will_reconsume_in(State);

    bool consumed_as_part_of_an_attribute() const;

    void restore_to(Utf8CodePointIterator const& new_iterator);
    HTMLToken::Position nth_last_position(size_t n = 0);

    JS::GCPtr<HTMLParser> m_parser;

    State m_state { State::Data };
    State m_return_state { State::Data };

    Vector<u32> m_temporary_buffer;

    ByteString m_decoded_input;

    struct InsertionPoint {
        size_t position { 0 };
        bool defined { false };
    };
    InsertionPoint m_insertion_point {};
    InsertionPoint m_old_insertion_point {};

    Utf8View m_utf8_view;
    Utf8CodePointIterator m_utf8_iterator;
    Utf8CodePointIterator m_prev_utf8_iterator;

    HTMLToken m_current_token;
    StringBuilder m_current_builder;

    Optional<ByteString> m_last_emitted_start_tag_name;

    bool m_explicit_eof_inserted { false };
    bool m_has_emitted_eof { false };

    Queue<HTMLToken> m_queued_tokens;

    u32 m_character_reference_code { 0 };

    bool m_blocked { false };

    bool m_aborted { false };

    Vector<HTMLToken::Position> m_source_positions;
};
}

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,23 @@
/*
* Copyright (c) 2024, Andrew Kaster <akaster@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibTextCodec/Decoder.h>
#include <LibWeb/HTML/Parser/HTMLTokenizerHelpers.h>
namespace Web::HTML {
// Decodes `text` from the named encoding into a UTF-8 String.
// Returns std::nullopt when no decoder exists for `encoding` or when the conversion fails.
OptionalString decode_to_utf8(StringView text, StringView encoding)
{
    auto maybe_decoder = TextCodec::decoder_for(encoding);
    if (maybe_decoder.has_value()) {
        auto conversion_result = maybe_decoder.value().to_utf8(text);
        if (!conversion_result.is_error())
            return conversion_result.release_value();
    }
    return std::nullopt;
}
}

View file

@ -0,0 +1,19 @@
/*
* Copyright (c) 2024, Andrew Kaster <akaster@serenityos.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <AK/String.h>
#include <AK/StringView.h>
#include <optional>
namespace Web::HTML {
// Swift-friendly wrapper for TextCodec::Decoder::to_utf8
// The result is reported through std::optional; an empty optional means decoding was not possible.
using OptionalString = std::optional<String>;
// Decodes `text`, interpreted in the encoding named by `encoding`, into a UTF-8 String.
OptionalString decode_to_utf8(StringView text, StringView encoding);
}

View file

@ -0,0 +1,90 @@
/*
* Copyright (c) 2020-2022, Andreas Kling <andreas@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibWeb/DOM/Element.h>
#include <LibWeb/HTML/Parser/ListOfActiveFormattingElements.h>
namespace Web::HTML {
// The destructor is declared in the header and defaulted here, out of line.
ListOfActiveFormattingElements::~ListOfActiveFormattingElements() = default;
// Passes the element pointer of every entry (markers included) to the visitor.
void ListOfActiveFormattingElements::visit_edges(JS::Cell::Visitor& visitor)
{
    for (size_t i = 0; i < m_entries.size(); ++i)
        visitor.visit(m_entries[i].element);
}
// Appends an entry for the given element to the end of the list.
void ListOfActiveFormattingElements::add(DOM::Element& element)
{
    // FIXME: Implement the Noah's Ark clause https://html.spec.whatwg.org/multipage/parsing.html#push-onto-the-list-of-active-formatting-elements
    m_entries.append(Entry { element });
}
void ListOfActiveFormattingElements::add_marker()
{
m_entries.append({ nullptr });
}
bool ListOfActiveFormattingElements::contains(const DOM::Element& element) const
{
for (auto& entry : m_entries) {
if (entry.element.ptr() == &element)
return true;
}
return false;
}
// Searches from the end of the list towards the front for an element whose local name is
// tag_name. The search stops at the first marker encountered; returns nullptr if no match
// was found before that marker or before the start of the list.
DOM::Element* ListOfActiveFormattingElements::last_element_with_tag_name_before_marker(FlyString const& tag_name)
{
    for (auto& entry : m_entries.in_reverse()) {
        if (entry.is_marker())
            return nullptr;
        if (entry.element->local_name() == tag_name)
            return entry.element.ptr();
    }
    return nullptr;
}
// Removes the first entry that refers to exactly this element object, if any.
void ListOfActiveFormattingElements::remove(DOM::Element& element)
{
    auto refers_to_element = [&](auto& entry) {
        return entry.element.ptr() == &element;
    };
    m_entries.remove_first_matching(refers_to_element);
}
// Pops entries off the end of the list until a marker has been popped or the list is empty.
void ListOfActiveFormattingElements::clear_up_to_the_last_marker()
{
    for (;;) {
        if (m_entries.is_empty())
            return;
        if (m_entries.take_last().is_marker())
            return;
    }
}
// Returns the index of the first entry that refers to exactly this element object,
// or an empty Optional if there is none.
Optional<size_t> ListOfActiveFormattingElements::find_index(DOM::Element const& element) const
{
    size_t index = 0;
    for (auto const& entry : m_entries) {
        if (entry.element.ptr() == &element)
            return index;
        ++index;
    }
    return {};
}
// Makes every entry that refers to to_remove refer to to_add instead.
void ListOfActiveFormattingElements::replace(DOM::Element& to_remove, DOM::Element& to_add)
{
    for (auto& entry : m_entries) {
        // Store the pointer directly; creating a temporary JS::Handle just to read the
        // pointer back out of it is unnecessary work.
        if (entry.element.ptr() == &to_remove)
            entry.element = &to_add;
    }
}
// Inserts an entry for the given element at position `index`.
void ListOfActiveFormattingElements::insert_at(size_t index, DOM::Element& element)
{
    m_entries.insert(index, Entry { element });
}
}

View file

@ -0,0 +1,51 @@
/*
* Copyright (c) 2020-2022, Andreas Kling <andreas@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <LibWeb/DOM/Element.h>
#include <LibWeb/Forward.h>
namespace Web::HTML {
class ListOfActiveFormattingElements {
public:
ListOfActiveFormattingElements() = default;
~ListOfActiveFormattingElements();
struct Entry {
bool is_marker() const { return !element; }
JS::GCPtr<DOM::Element> element;
};
bool is_empty() const { return m_entries.is_empty(); }
bool contains(const DOM::Element&) const;
void add(DOM::Element& element);
void add_marker();
void insert_at(size_t index, DOM::Element& element);
void replace(DOM::Element& to_remove, DOM::Element& to_add);
void remove(DOM::Element&);
Vector<Entry> const& entries() const { return m_entries; }
Vector<Entry>& entries() { return m_entries; }
DOM::Element* last_element_with_tag_name_before_marker(FlyString const& tag_name);
void clear_up_to_the_last_marker();
Optional<size_t> find_index(DOM::Element const&) const;
void visit_edges(JS::Cell::Visitor&);
private:
Vector<Entry> m_entries;
};
}

View file

@ -0,0 +1,189 @@
/*
* Copyright (c) 2020-2022, Andreas Kling <andreas@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#include <LibWeb/DOM/Element.h>
#include <LibWeb/HTML/Parser/HTMLParser.h>
#include <LibWeb/HTML/Parser/StackOfOpenElements.h>
#include <LibWeb/Namespace.h>
namespace Web::HTML {
// Tag names that end a has_in_scope() search without a match.
// has_in_button_scope() and has_in_list_item_scope() use this list plus additional names.
static Vector<FlyString> s_base_list { "applet"_fly_string, "caption"_fly_string, "html"_fly_string, "table"_fly_string, "td"_fly_string, "th"_fly_string, "marquee"_fly_string, "object"_fly_string, "template"_fly_string };
// The destructor is declared in the header and defaulted here, out of line.
StackOfOpenElements::~StackOfOpenElements() = default;
// Passes the whole element vector to the visitor.
void StackOfOpenElements::visit_edges(JS::Cell::Visitor& visitor)
{
visitor.visit(m_elements);
}
// Walks the stack from the most recently pushed element towards the first one.
// Returns true on the first element whose local name equals tag_name, and false on the
// first element whose local name appears in `list`. The tag-name check is made first.
// Running off the start of the stack without hitting either is treated as unreachable.
bool StackOfOpenElements::has_in_scope_impl(FlyString const& tag_name, Vector<FlyString> const& list) const
{
    for (ssize_t i = m_elements.size() - 1; i >= 0; --i) {
        auto const& local_name = m_elements[i]->local_name();
        if (local_name == tag_name)
            return true;
        if (list.contains_slow(local_name))
            return false;
    }
    VERIFY_NOT_REACHED();
}
// Scope check by tag name, bounded by the names in s_base_list.
bool StackOfOpenElements::has_in_scope(FlyString const& tag_name) const
{
return has_in_scope_impl(tag_name, s_base_list);
}
bool StackOfOpenElements::has_in_scope_impl(const DOM::Element& target_node, Vector<FlyString> const& list) const
{
for (auto& element : m_elements.in_reverse()) {
if (element.ptr() == &target_node)
return true;
if (list.contains_slow(element->local_name()))
return false;
}
VERIFY_NOT_REACHED();
}
// Scope check for a specific element object, bounded by the names in s_base_list.
bool StackOfOpenElements::has_in_scope(const DOM::Element& target_node) const
{
return has_in_scope_impl(target_node, s_base_list);
}
// Scope check by tag name, bounded by s_base_list plus "button".
bool StackOfOpenElements::has_in_button_scope(FlyString const& tag_name) const
{
    // Built once on first use, rather than copying and extending s_base_list on every call.
    static Vector<FlyString> const s_button_scope_list = [] {
        auto list = s_base_list;
        list.append("button"_fly_string);
        return list;
    }();
    return has_in_scope_impl(tag_name, s_button_scope_list);
}
// Scope check by tag name, bounded only by "html", "table" and "template".
bool StackOfOpenElements::has_in_table_scope(FlyString const& tag_name) const
{
    // Built once on first use, rather than constructing a temporary list on every call.
    static Vector<FlyString> const s_table_scope_list { "html"_fly_string, "table"_fly_string, "template"_fly_string };
    return has_in_scope_impl(tag_name, s_table_scope_list);
}
// Scope check by tag name, bounded by s_base_list plus "ol" and "ul".
bool StackOfOpenElements::has_in_list_item_scope(FlyString const& tag_name) const
{
    // Built once on first use, rather than copying and extending s_base_list on every call.
    static Vector<FlyString> const s_list_item_scope_list = [] {
        auto list = s_base_list;
        list.append("ol"_fly_string);
        list.append("ul"_fly_string);
        return list;
    }();
    return has_in_scope_impl(tag_name, s_list_item_scope_list);
}
// https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-select-scope
// The stack of open elements is said to have a particular element in select scope
// when it has that element in the specific scope consisting of all element types except the following:
// - optgroup in the HTML namespace
// - option in the HTML namespace
// NOTE: In this case it's "all element types _except_"
bool StackOfOpenElements::has_in_select_scope(FlyString const& tag_name) const
{
    // https://html.spec.whatwg.org/multipage/parsing.html#has-an-element-in-the-specific-scope
    // 1. Initialize node to be the current node (the bottommost node of the stack).
    for (ssize_t i = m_elements.size() - 1; i >= 0; --i) {
        auto const& local_name = m_elements[i]->local_name();

        // 2. If node is the target node, terminate in a match state.
        if (local_name == tag_name)
            return true;

        // 3. Otherwise, if node is one of the element types in list, terminate in a failure state.
        // NOTE: Here "list" refers to all elements except option and optgroup
        bool const is_option_or_optgroup = local_name == HTML::TagNames::option || local_name == HTML::TagNames::optgroup;
        if (!is_option_or_optgroup)
            return false;

        // 4. Otherwise, set node to the previous entry in the stack of open elements and return to step 2.
    }

    // [4.] (This will never fail, since the loop will always terminate in the previous step if the top of the stack
    // — an html element — is reached.)
    VERIFY_NOT_REACHED();
}
bool StackOfOpenElements::contains(const DOM::Element& element) const
{
for (auto& element_on_stack : m_elements) {
if (&element == element_on_stack.ptr())
return true;
}
return false;
}
// Returns true if the stack holds an element in the HTML namespace whose local name is "template".
bool StackOfOpenElements::contains_template_element() const
{
    for (auto const& element : m_elements) {
        bool const is_html_template = element->namespace_uri() == Namespace::HTML
            && element->local_name() == HTML::TagNames::template_;
        if (is_html_template)
            return true;
    }
    return false;
}
// Pops elements until the current node is an HTML-namespace element named tag_name,
// then pops that element as well.
// NOTE(review): there is no empty-stack guard here; presumably callers only use this when
// such an element is known to be on the stack.
void StackOfOpenElements::pop_until_an_element_with_tag_name_has_been_popped(FlyString const& tag_name)
{
    auto current_node_is_target = [&] {
        return m_elements.last()->namespace_uri() == Namespace::HTML && m_elements.last()->local_name() == tag_name;
    };

    while (!current_node_is_target())
        (void)pop();
    (void)pop();
}
// Scans from the most recently pushed element back towards formatting_element and remembers
// each "special" element seen (per HTMLParser::is_special_tag). The last one remembered
// before reaching formatting_element is returned; null if none was seen.
JS::GCPtr<DOM::Element> StackOfOpenElements::topmost_special_node_below(DOM::Element const& formatting_element)
{
    JS::GCPtr<DOM::Element> topmost_special = nullptr;
    for (ssize_t i = m_elements.size() - 1; i >= 0; --i) {
        auto& candidate = m_elements[i];
        if (candidate.ptr() == &formatting_element)
            break;
        if (HTMLParser::is_special_tag(candidate->local_name(), candidate->namespace_uri()))
            topmost_special = candidate.ptr();
    }
    return topmost_special.ptr();
}
// Finds the most recently pushed element whose local name is tag_name.
// Returns the element and its index, or { nullptr, -1 } if there is no such element.
StackOfOpenElements::LastElementResult StackOfOpenElements::last_element_with_tag_name(FlyString const& tag_name)
{
    ssize_t index = m_elements.size();
    while (--index >= 0) {
        auto& candidate = m_elements[index];
        if (candidate->local_name() == tag_name)
            return { candidate.ptr(), index };
    }
    return { nullptr, -1 };
}
// Scans from the most recently pushed element towards the first one and returns the first
// element visited after `target` that is not `target` itself.
// Returns null if target is not on the stack or nothing follows it in that direction.
JS::GCPtr<DOM::Element> StackOfOpenElements::element_immediately_above(DOM::Element const& target)
{
    bool passed_target = false;
    for (ssize_t i = m_elements.size() - 1; i >= 0; --i) {
        auto& candidate = m_elements[i];
        if (candidate.ptr() == &target) {
            passed_target = true;
            continue;
        }
        if (passed_target)
            return candidate.ptr();
    }
    return nullptr;
}
// Removes the first stack entry that is exactly this element object, if any.
void StackOfOpenElements::remove(DOM::Element const& element)
{
    auto is_same_element = [&element](auto& candidate) {
        return candidate.ptr() == &element;
    };
    m_elements.remove_first_matching(is_same_element);
}
// Replaces the first stack entry that is exactly to_remove with to_add, keeping its position.
// Does nothing if to_remove is not on the stack.
void StackOfOpenElements::replace(DOM::Element const& to_remove, JS::NonnullGCPtr<DOM::Element> to_add)
{
    for (size_t i = 0; i < m_elements.size(); i++) {
        if (m_elements[i].ptr() != &to_remove)
            continue;
        // Overwrite the slot in place; removing and re-inserting at the same index would
        // shift the tail of the vector twice for the same result.
        m_elements[i] = to_add;
        break;
    }
}
// Inserts element_to_add directly after the first stack entry that is exactly `target`.
// Does nothing if target is not on the stack.
void StackOfOpenElements::insert_immediately_below(JS::NonnullGCPtr<DOM::Element> element_to_add, DOM::Element const& target)
{
    for (size_t index = 0; index < m_elements.size(); ++index) {
        if (m_elements[index].ptr() != &target)
            continue;
        m_elements.insert(index + 1, element_to_add);
        return;
    }
}
}

View file

@ -0,0 +1,72 @@
/*
* Copyright (c) 2020-2022, Andreas Kling <andreas@ladybird.org>
*
* SPDX-License-Identifier: BSD-2-Clause
*/
#pragma once
#include <LibWeb/DOM/Element.h>
#include <LibWeb/Forward.h>
namespace Web::HTML {
// https://html.spec.whatwg.org/multipage/parsing.html#stack-of-open-elements
class StackOfOpenElements {
public:
// Initially, the stack of open elements is empty.
// The stack grows downwards; the topmost node on the stack is the first one added to the stack,
// and the bottommost node of the stack is the most recently added node in the stack
// (notwithstanding when the stack is manipulated in a random access fashion as part of the handling for misnested tags).
StackOfOpenElements() = default;
~StackOfOpenElements();
DOM::Element& first() { return *m_elements.first(); }
DOM::Element& last() { return *m_elements.last(); }
bool is_empty() const { return m_elements.is_empty(); }
void push(JS::NonnullGCPtr<DOM::Element> element) { m_elements.append(element); }
JS::NonnullGCPtr<DOM::Element> pop() { return *m_elements.take_last(); }
void remove(DOM::Element const& element);
void replace(DOM::Element const& to_remove, JS::NonnullGCPtr<DOM::Element> to_add);
void insert_immediately_below(JS::NonnullGCPtr<DOM::Element> element_to_add, DOM::Element const& target);
const DOM::Element& current_node() const { return *m_elements.last(); }
DOM::Element& current_node() { return *m_elements.last(); }
bool has_in_scope(FlyString const& tag_name) const;
bool has_in_button_scope(FlyString const& tag_name) const;
bool has_in_table_scope(FlyString const& tag_name) const;
bool has_in_list_item_scope(FlyString const& tag_name) const;
bool has_in_select_scope(FlyString const& tag_name) const;
bool has_in_scope(const DOM::Element&) const;
bool contains(const DOM::Element&) const;
[[nodiscard]] bool contains_template_element() const;
auto const& elements() const { return m_elements; }
auto& elements() { return m_elements; }
void pop_until_an_element_with_tag_name_has_been_popped(FlyString const& local_name);
JS::GCPtr<DOM::Element> topmost_special_node_below(DOM::Element const&);
struct LastElementResult {
JS::GCPtr<DOM::Element> element;
ssize_t index;
};
LastElementResult last_element_with_tag_name(FlyString const&);
JS::GCPtr<DOM::Element> element_immediately_above(DOM::Element const&);
void visit_edges(JS::Cell::Visitor&);
private:
bool has_in_scope_impl(FlyString const& tag_name, Vector<FlyString> const&) const;
bool has_in_scope_impl(const DOM::Element& target_node, Vector<FlyString> const&) const;
Vector<JS::NonnullGCPtr<DOM::Element>> m_elements;
};
}