diff --git a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp index c13ce25192b..ab8ea5ffb21 100644 --- a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp +++ b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp @@ -22,7 +22,7 @@ namespace Web::HTML { #pragma GCC diagnostic ignored "-Wunused-label" #define CONSUME_NEXT_INPUT_CHARACTER \ - current_input_character = next_code_point(); + current_input_character = next_code_point(stop_at_insertion_point); #define SWITCH_TO(new_state) \ do { \ @@ -195,7 +195,7 @@ static inline void log_parse_error(SourceLocation const& location = SourceLocati dbgln_if(TOKENIZER_TRACE_DEBUG, "Parse error (tokenization) {}", location); } -Optional HTMLTokenizer::next_code_point() +Optional HTMLTokenizer::next_code_point(StopAtInsertionPoint stop_at_insertion_point) { if (m_utf8_iterator == m_utf8_view.end()) return {}; @@ -203,11 +203,11 @@ Optional HTMLTokenizer::next_code_point() u32 code_point; // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization // https://infra.spec.whatwg.org/#normalize-newlines - if (peek_code_point(0).value_or(0) == '\r' && peek_code_point(1).value_or(0) == '\n') { + if (peek_code_point(0, stop_at_insertion_point).value_or(0) == '\r' && peek_code_point(1, stop_at_insertion_point).value_or(0) == '\n') { // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point, skip(2); code_point = '\n'; - } else if (peek_code_point(0).value_or(0) == '\r') { + } else if (peek_code_point(0, stop_at_insertion_point).value_or(0) == '\r') { // replace every remaining U+000D CR code point with a U+000A LF code point. skip(1); code_point = '\n'; @@ -240,11 +240,16 @@ void HTMLTokenizer::skip(size_t count) } } -Optional HTMLTokenizer::peek_code_point(size_t offset) const +Optional HTMLTokenizer::peek_code_point(size_t offset, StopAtInsertionPoint stop_at_insertion_point) const { auto it = m_utf8_iterator; for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i) ++it; + if (stop_at_insertion_point == StopAtInsertionPoint::Yes + && m_insertion_point.defined + && m_utf8_view.byte_offset_of(it) >= m_insertion_point.position) { + return {}; + } if (it == m_utf8_view.end()) return {}; return *it; @@ -277,7 +282,7 @@ _StartOfFunction: if (stop_at_insertion_point == StopAtInsertionPoint::Yes && is_insertion_point_reached()) return {}; - auto current_input_character = next_code_point(); + auto current_input_character = next_code_point(stop_at_insertion_point); switch (m_state) { // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state BEGIN_STATE(Data) @@ -424,15 +429,31 @@ _StartOfFunction: BEGIN_STATE(MarkupDeclarationOpen) { DONT_CONSUME_NEXT_INPUT_CHARACTER; - if (consume_next_if_match("--"sv)) { + + switch (consume_next_if_match("--"sv, stop_at_insertion_point)) { + case ConsumeNextResult::Consumed: create_new_token(HTMLToken::Type::Comment); m_current_token.set_start_position({}, nth_last_position(3)); SWITCH_TO(CommentStart); + break; + case ConsumeNextResult::NotConsumed: + break; + case ConsumeNextResult::RanOutOfCharacters: + return {}; } - if (consume_next_if_match("DOCTYPE"sv, CaseSensitivity::CaseInsensitive)) { + + switch (consume_next_if_match("DOCTYPE"sv, stop_at_insertion_point, CaseSensitivity::CaseInsensitive)) { + case ConsumeNextResult::Consumed: SWITCH_TO(DOCTYPE); + break; + case ConsumeNextResult::NotConsumed: + break; + case ConsumeNextResult::RanOutOfCharacters: + return {}; } - if (consume_next_if_match("[CDATA["sv)) { + + switch (consume_next_if_match("[CDATA["sv, stop_at_insertion_point)) { + case ConsumeNextResult::Consumed: // We keep the parser optional so that syntax highlighting can be lexer-only. // The parser registers itself with the lexer it creates. if (m_parser != nullptr @@ -444,6 +465,11 @@ _StartOfFunction: m_current_builder.append("[CDATA["sv); SWITCH_TO_WITH_UNCLEAN_BUILDER(BogusComment); } + break; + case ConsumeNextResult::NotConsumed: + break; + case ConsumeNextResult::RanOutOfCharacters: + return {}; } ANYTHING_ELSE { @@ -614,11 +640,29 @@ _StartOfFunction: } ANYTHING_ELSE { - if (to_ascii_uppercase(current_input_character.value()) == 'P' && consume_next_if_match("UBLIC"sv, CaseSensitivity::CaseInsensitive)) { - SWITCH_TO(AfterDOCTYPEPublicKeyword); + if (to_ascii_uppercase(current_input_character.value()) == 'P') { + switch (consume_next_if_match("UBLIC"sv, stop_at_insertion_point, CaseSensitivity::CaseInsensitive)) { + case ConsumeNextResult::Consumed: + SWITCH_TO(AfterDOCTYPEPublicKeyword); + break; + case ConsumeNextResult::NotConsumed: + break; + case ConsumeNextResult::RanOutOfCharacters: + DONT_CONSUME_NEXT_INPUT_CHARACTER; + return {}; + } } - if (to_ascii_uppercase(current_input_character.value()) == 'S' && consume_next_if_match("YSTEM"sv, CaseSensitivity::CaseInsensitive)) { - SWITCH_TO(AfterDOCTYPESystemKeyword); + if (to_ascii_uppercase(current_input_character.value()) == 'S') { + switch (consume_next_if_match("YSTEM"sv, stop_at_insertion_point, CaseSensitivity::CaseInsensitive)) { + case ConsumeNextResult::Consumed: + SWITCH_TO(AfterDOCTYPESystemKeyword); + break; + case ConsumeNextResult::NotConsumed: + break; + case ConsumeNextResult::RanOutOfCharacters: + DONT_CONSUME_NEXT_INPUT_CHARACTER; + return {}; + } } log_parse_error(); m_current_token.ensure_doctype_data().force_quirks = true; @@ -1666,7 +1710,7 @@ _StartOfFunction: m_temporary_buffer.append(ch); if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) { - auto next_code_point = peek_code_point(0); + auto next_code_point = peek_code_point(0, stop_at_insertion_point); if (next_code_point.has_value() && (next_code_point.value() == '=' || is_ascii_alphanumeric(next_code_point.value()))) { FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE; SWITCH_TO_RETURN_STATE; @@ -2766,25 +2810,29 @@ _StartOfFunction: } } -bool HTMLTokenizer::consume_next_if_match(StringView string, CaseSensitivity case_sensitivity) +HTMLTokenizer::ConsumeNextResult HTMLTokenizer::consume_next_if_match(StringView string, StopAtInsertionPoint stop_at_insertion_point, CaseSensitivity case_sensitivity) { for (size_t i = 0; i < string.length(); ++i) { - auto code_point = peek_code_point(i); - if (!code_point.has_value()) - return false; + auto code_point = peek_code_point(i, stop_at_insertion_point); + if (!code_point.has_value()) { + if (StopAtInsertionPoint::Yes == stop_at_insertion_point) { + return ConsumeNextResult::RanOutOfCharacters; + } + return ConsumeNextResult::NotConsumed; + } // FIXME: This should be more Unicode-aware. if (case_sensitivity == CaseSensitivity::CaseInsensitive) { if (code_point.value() < 0x80) { if (to_ascii_lowercase(code_point.value()) != to_ascii_lowercase(string[i])) - return false; + return ConsumeNextResult::NotConsumed; continue; } } if (code_point.value() != (u32)string[i]) - return false; + return ConsumeNextResult::NotConsumed; } skip(string.length()); - return true; + return ConsumeNextResult::Consumed; } void HTMLTokenizer::create_new_token(HTMLToken::Type type) diff --git a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h index 46bd97f1894..06f235b14f1 100644 --- a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h +++ b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h @@ -153,9 +153,16 @@ public: private: void skip(size_t count); - Optional next_code_point(); - Optional peek_code_point(size_t offset) const; - bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive); + Optional next_code_point(StopAtInsertionPoint); + Optional peek_code_point(size_t offset, StopAtInsertionPoint) const; + + enum class ConsumeNextResult { + Consumed, + NotConsumed, + RanOutOfCharacters, + }; + [[nodiscard]] ConsumeNextResult consume_next_if_match(StringView, StopAtInsertionPoint, CaseSensitivity = CaseSensitivity::CaseSensitive); + void create_new_token(HTMLToken::Type); bool current_end_tag_token_is_appropriate() const; String consume_current_builder(); diff --git a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/018.txt b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/018.txt index 400fa5782b1..49a300c2d27 100644 --- a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/018.txt +++ b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/018.txt @@ -6,6 +6,6 @@ Rerun Found 1 tests -1 Fail +1 Pass Details -Result Test Name MessageFail document.write \ No newline at end of file +Result Test Name MessagePass document.write \ No newline at end of file diff --git a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/034.txt b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/034.txt index 400fa5782b1..49a300c2d27 100644 --- a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/034.txt +++ b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/034.txt @@ -6,6 +6,6 @@ Rerun Found 1 tests -1 Fail +1 Pass Details -Result Test Name MessageFail document.write \ No newline at end of file +Result Test Name MessagePass document.write \ No newline at end of file diff --git a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/036.txt b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/036.txt index 400fa5782b1..49a300c2d27 100644 --- a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/036.txt +++ b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/036.txt @@ -6,6 +6,6 @@ Rerun Found 1 tests -1 Fail +1 Pass Details -Result Test Name MessageFail document.write \ No newline at end of file +Result Test Name MessagePass document.write \ No newline at end of file diff --git a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/037.txt b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/037.txt index 400fa5782b1..49a300c2d27 100644 --- a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/037.txt +++ b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/037.txt @@ -6,6 +6,6 @@ Rerun Found 1 tests -1 Fail +1 Pass Details -Result Test Name MessageFail document.write \ No newline at end of file +Result Test Name MessagePass document.write \ No newline at end of file