LibWeb: Teach more of HTMLTokenizer to stop at the insertion point

In particular, input character lookahead (peek_code_point() and
consume_next_if_match()) now knows how to stop at the insertion point
marker when the caller asks for it, instead of reading past it.

This makes it possible to do amazing things like having document.write()
insert doctypes one character at a time.
This commit is contained in:
Andreas Kling 2024-11-24 10:18:17 +01:00 committed by Andreas Kling
commit 5c70436cb2
Notes: github-actions[bot] 2024-11-24 10:46:43 +00:00
6 changed files with 87 additions and 32 deletions

View file

@ -22,7 +22,7 @@ namespace Web::HTML {
#pragma GCC diagnostic ignored "-Wunused-label" #pragma GCC diagnostic ignored "-Wunused-label"
#define CONSUME_NEXT_INPUT_CHARACTER \ #define CONSUME_NEXT_INPUT_CHARACTER \
current_input_character = next_code_point(); current_input_character = next_code_point(stop_at_insertion_point);
#define SWITCH_TO(new_state) \ #define SWITCH_TO(new_state) \
do { \ do { \
@ -195,7 +195,7 @@ static inline void log_parse_error(SourceLocation const& location = SourceLocati
dbgln_if(TOKENIZER_TRACE_DEBUG, "Parse error (tokenization) {}", location); dbgln_if(TOKENIZER_TRACE_DEBUG, "Parse error (tokenization) {}", location);
} }
Optional<u32> HTMLTokenizer::next_code_point() Optional<u32> HTMLTokenizer::next_code_point(StopAtInsertionPoint stop_at_insertion_point)
{ {
if (m_utf8_iterator == m_utf8_view.end()) if (m_utf8_iterator == m_utf8_view.end())
return {}; return {};
@ -203,11 +203,11 @@ Optional<u32> HTMLTokenizer::next_code_point()
u32 code_point; u32 code_point;
// https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
// https://infra.spec.whatwg.org/#normalize-newlines // https://infra.spec.whatwg.org/#normalize-newlines
if (peek_code_point(0).value_or(0) == '\r' && peek_code_point(1).value_or(0) == '\n') { if (peek_code_point(0, stop_at_insertion_point).value_or(0) == '\r' && peek_code_point(1, stop_at_insertion_point).value_or(0) == '\n') {
// replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point, // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
skip(2); skip(2);
code_point = '\n'; code_point = '\n';
} else if (peek_code_point(0).value_or(0) == '\r') { } else if (peek_code_point(0, stop_at_insertion_point).value_or(0) == '\r') {
// replace every remaining U+000D CR code point with a U+000A LF code point. // replace every remaining U+000D CR code point with a U+000A LF code point.
skip(1); skip(1);
code_point = '\n'; code_point = '\n';
@ -240,11 +240,16 @@ void HTMLTokenizer::skip(size_t count)
} }
} }
Optional<u32> HTMLTokenizer::peek_code_point(size_t offset) const Optional<u32> HTMLTokenizer::peek_code_point(size_t offset, StopAtInsertionPoint stop_at_insertion_point) const
{ {
auto it = m_utf8_iterator; auto it = m_utf8_iterator;
for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i) for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
++it; ++it;
if (stop_at_insertion_point == StopAtInsertionPoint::Yes
&& m_insertion_point.defined
&& m_utf8_view.byte_offset_of(it) >= m_insertion_point.position) {
return {};
}
if (it == m_utf8_view.end()) if (it == m_utf8_view.end())
return {}; return {};
return *it; return *it;
@ -277,7 +282,7 @@ _StartOfFunction:
if (stop_at_insertion_point == StopAtInsertionPoint::Yes && is_insertion_point_reached()) if (stop_at_insertion_point == StopAtInsertionPoint::Yes && is_insertion_point_reached())
return {}; return {};
auto current_input_character = next_code_point(); auto current_input_character = next_code_point(stop_at_insertion_point);
switch (m_state) { switch (m_state) {
// 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
BEGIN_STATE(Data) BEGIN_STATE(Data)
@ -424,15 +429,31 @@ _StartOfFunction:
BEGIN_STATE(MarkupDeclarationOpen) BEGIN_STATE(MarkupDeclarationOpen)
{ {
DONT_CONSUME_NEXT_INPUT_CHARACTER; DONT_CONSUME_NEXT_INPUT_CHARACTER;
if (consume_next_if_match("--"sv)) {
switch (consume_next_if_match("--"sv, stop_at_insertion_point)) {
case ConsumeNextResult::Consumed:
create_new_token(HTMLToken::Type::Comment); create_new_token(HTMLToken::Type::Comment);
m_current_token.set_start_position({}, nth_last_position(3)); m_current_token.set_start_position({}, nth_last_position(3));
SWITCH_TO(CommentStart); SWITCH_TO(CommentStart);
break;
case ConsumeNextResult::NotConsumed:
break;
case ConsumeNextResult::RanOutOfCharacters:
return {};
} }
if (consume_next_if_match("DOCTYPE"sv, CaseSensitivity::CaseInsensitive)) {
switch (consume_next_if_match("DOCTYPE"sv, stop_at_insertion_point, CaseSensitivity::CaseInsensitive)) {
case ConsumeNextResult::Consumed:
SWITCH_TO(DOCTYPE); SWITCH_TO(DOCTYPE);
break;
case ConsumeNextResult::NotConsumed:
break;
case ConsumeNextResult::RanOutOfCharacters:
return {};
} }
if (consume_next_if_match("[CDATA["sv)) {
switch (consume_next_if_match("[CDATA["sv, stop_at_insertion_point)) {
case ConsumeNextResult::Consumed:
// We keep the parser optional so that syntax highlighting can be lexer-only. // We keep the parser optional so that syntax highlighting can be lexer-only.
// The parser registers itself with the lexer it creates. // The parser registers itself with the lexer it creates.
if (m_parser != nullptr if (m_parser != nullptr
@ -444,6 +465,11 @@ _StartOfFunction:
m_current_builder.append("[CDATA["sv); m_current_builder.append("[CDATA["sv);
SWITCH_TO_WITH_UNCLEAN_BUILDER(BogusComment); SWITCH_TO_WITH_UNCLEAN_BUILDER(BogusComment);
} }
break;
case ConsumeNextResult::NotConsumed:
break;
case ConsumeNextResult::RanOutOfCharacters:
return {};
} }
ANYTHING_ELSE ANYTHING_ELSE
{ {
@ -614,11 +640,29 @@ _StartOfFunction:
} }
ANYTHING_ELSE ANYTHING_ELSE
{ {
if (to_ascii_uppercase(current_input_character.value()) == 'P' && consume_next_if_match("UBLIC"sv, CaseSensitivity::CaseInsensitive)) { if (to_ascii_uppercase(current_input_character.value()) == 'P') {
switch (consume_next_if_match("UBLIC"sv, stop_at_insertion_point, CaseSensitivity::CaseInsensitive)) {
case ConsumeNextResult::Consumed:
SWITCH_TO(AfterDOCTYPEPublicKeyword); SWITCH_TO(AfterDOCTYPEPublicKeyword);
break;
case ConsumeNextResult::NotConsumed:
break;
case ConsumeNextResult::RanOutOfCharacters:
DONT_CONSUME_NEXT_INPUT_CHARACTER;
return {};
} }
if (to_ascii_uppercase(current_input_character.value()) == 'S' && consume_next_if_match("YSTEM"sv, CaseSensitivity::CaseInsensitive)) { }
if (to_ascii_uppercase(current_input_character.value()) == 'S') {
switch (consume_next_if_match("YSTEM"sv, stop_at_insertion_point, CaseSensitivity::CaseInsensitive)) {
case ConsumeNextResult::Consumed:
SWITCH_TO(AfterDOCTYPESystemKeyword); SWITCH_TO(AfterDOCTYPESystemKeyword);
break;
case ConsumeNextResult::NotConsumed:
break;
case ConsumeNextResult::RanOutOfCharacters:
DONT_CONSUME_NEXT_INPUT_CHARACTER;
return {};
}
} }
log_parse_error(); log_parse_error();
m_current_token.ensure_doctype_data().force_quirks = true; m_current_token.ensure_doctype_data().force_quirks = true;
@ -1666,7 +1710,7 @@ _StartOfFunction:
m_temporary_buffer.append(ch); m_temporary_buffer.append(ch);
if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) { if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) {
auto next_code_point = peek_code_point(0); auto next_code_point = peek_code_point(0, stop_at_insertion_point);
if (next_code_point.has_value() && (next_code_point.value() == '=' || is_ascii_alphanumeric(next_code_point.value()))) { if (next_code_point.has_value() && (next_code_point.value() == '=' || is_ascii_alphanumeric(next_code_point.value()))) {
FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
SWITCH_TO_RETURN_STATE; SWITCH_TO_RETURN_STATE;
@ -2766,25 +2810,29 @@ _StartOfFunction:
} }
} }
bool HTMLTokenizer::consume_next_if_match(StringView string, CaseSensitivity case_sensitivity) HTMLTokenizer::ConsumeNextResult HTMLTokenizer::consume_next_if_match(StringView string, StopAtInsertionPoint stop_at_insertion_point, CaseSensitivity case_sensitivity)
{ {
for (size_t i = 0; i < string.length(); ++i) { for (size_t i = 0; i < string.length(); ++i) {
auto code_point = peek_code_point(i); auto code_point = peek_code_point(i, stop_at_insertion_point);
if (!code_point.has_value()) if (!code_point.has_value()) {
return false; if (StopAtInsertionPoint::Yes == stop_at_insertion_point) {
return ConsumeNextResult::RanOutOfCharacters;
}
return ConsumeNextResult::NotConsumed;
}
// FIXME: This should be more Unicode-aware. // FIXME: This should be more Unicode-aware.
if (case_sensitivity == CaseSensitivity::CaseInsensitive) { if (case_sensitivity == CaseSensitivity::CaseInsensitive) {
if (code_point.value() < 0x80) { if (code_point.value() < 0x80) {
if (to_ascii_lowercase(code_point.value()) != to_ascii_lowercase(string[i])) if (to_ascii_lowercase(code_point.value()) != to_ascii_lowercase(string[i]))
return false; return ConsumeNextResult::NotConsumed;
continue; continue;
} }
} }
if (code_point.value() != (u32)string[i]) if (code_point.value() != (u32)string[i])
return false; return ConsumeNextResult::NotConsumed;
} }
skip(string.length()); skip(string.length());
return true; return ConsumeNextResult::Consumed;
} }
void HTMLTokenizer::create_new_token(HTMLToken::Type type) void HTMLTokenizer::create_new_token(HTMLToken::Type type)

View file

@ -153,9 +153,16 @@ public:
private: private:
void skip(size_t count); void skip(size_t count);
Optional<u32> next_code_point(); Optional<u32> next_code_point(StopAtInsertionPoint);
Optional<u32> peek_code_point(size_t offset) const; Optional<u32> peek_code_point(size_t offset, StopAtInsertionPoint) const;
bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive);
enum class ConsumeNextResult {
Consumed,
NotConsumed,
RanOutOfCharacters,
};
[[nodiscard]] ConsumeNextResult consume_next_if_match(StringView, StopAtInsertionPoint, CaseSensitivity = CaseSensitivity::CaseSensitive);
void create_new_token(HTMLToken::Type); void create_new_token(HTMLToken::Type);
bool current_end_tag_token_is_appropriate() const; bool current_end_tag_token_is_appropriate() const;
String consume_current_builder(); String consume_current_builder();

View file

@ -6,6 +6,6 @@ Rerun
Found 1 tests Found 1 tests
1 Fail 1 Pass
Details Details
Result Test Name MessageFail document.write Result Test Name MessagePass document.write

View file

@ -6,6 +6,6 @@ Rerun
Found 1 tests Found 1 tests
1 Fail 1 Pass
Details Details
Result Test Name MessageFail document.write Result Test Name MessagePass document.write

View file

@ -6,6 +6,6 @@ Rerun
Found 1 tests Found 1 tests
1 Fail 1 Pass
Details Details
Result Test Name MessageFail document.write Result Test Name MessagePass document.write

View file

@ -6,6 +6,6 @@ Rerun
Found 1 tests Found 1 tests
1 Fail 1 Pass
Details Details
Result Test Name MessageFail document.write Result Test Name MessagePass document.write