diff --git a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
index c13ce25192b..ab8ea5ffb21 100644
--- a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
+++ b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
@@ -22,7 +22,7 @@ namespace Web::HTML {
#pragma GCC diagnostic ignored "-Wunused-label"
#define CONSUME_NEXT_INPUT_CHARACTER \
- current_input_character = next_code_point();
+ current_input_character = next_code_point(stop_at_insertion_point); // Expands textually: both `current_input_character` and `stop_at_insertion_point` must be in scope at every use site.
#define SWITCH_TO(new_state) \
do { \
@@ -195,7 +195,7 @@ static inline void log_parse_error(SourceLocation const& location = SourceLocati
dbgln_if(TOKENIZER_TRACE_DEBUG, "Parse error (tokenization) {}", location);
}
-Optional HTMLTokenizer::next_code_point()
+Optional HTMLTokenizer::next_code_point(StopAtInsertionPoint stop_at_insertion_point)
{
if (m_utf8_iterator == m_utf8_view.end())
return {};
@@ -203,11 +203,11 @@ Optional HTMLTokenizer::next_code_point()
u32 code_point;
// https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
// https://infra.spec.whatwg.org/#normalize-newlines
- if (peek_code_point(0).value_or(0) == '\r' && peek_code_point(1).value_or(0) == '\n') {
+ if (peek_code_point(0, stop_at_insertion_point).value_or(0) == '\r' && peek_code_point(1, stop_at_insertion_point).value_or(0) == '\n') {
// replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
skip(2);
code_point = '\n';
- } else if (peek_code_point(0).value_or(0) == '\r') {
+ } else if (peek_code_point(0, stop_at_insertion_point).value_or(0) == '\r') {
// replace every remaining U+000D CR code point with a U+000A LF code point.
skip(1);
code_point = '\n';
@@ -240,11 +240,16 @@ void HTMLTokenizer::skip(size_t count)
}
}
-Optional HTMLTokenizer::peek_code_point(size_t offset) const
+Optional HTMLTokenizer::peek_code_point(size_t offset, StopAtInsertionPoint stop_at_insertion_point) const
{
auto it = m_utf8_iterator;
for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
++it;
+ if (stop_at_insertion_point == StopAtInsertionPoint::Yes
+ && m_insertion_point.defined
+ && m_utf8_view.byte_offset_of(it) >= m_insertion_point.position) {
+ return {};
+ }
if (it == m_utf8_view.end())
return {};
return *it;
@@ -277,7 +282,7 @@ _StartOfFunction:
if (stop_at_insertion_point == StopAtInsertionPoint::Yes && is_insertion_point_reached())
return {};
- auto current_input_character = next_code_point();
+ auto current_input_character = next_code_point(stop_at_insertion_point);
switch (m_state) {
// 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
BEGIN_STATE(Data)
@@ -424,15 +429,31 @@ _StartOfFunction:
BEGIN_STATE(MarkupDeclarationOpen)
{
DONT_CONSUME_NEXT_INPUT_CHARACTER;
- if (consume_next_if_match("--"sv)) {
+
+ switch (consume_next_if_match("--"sv, stop_at_insertion_point)) {
+ case ConsumeNextResult::Consumed:
create_new_token(HTMLToken::Type::Comment);
m_current_token.set_start_position({}, nth_last_position(3));
SWITCH_TO(CommentStart);
+ break;
+ case ConsumeNextResult::NotConsumed:
+ break;
+ case ConsumeNextResult::RanOutOfCharacters:
+ return {};
}
- if (consume_next_if_match("DOCTYPE"sv, CaseSensitivity::CaseInsensitive)) {
+
+ switch (consume_next_if_match("DOCTYPE"sv, stop_at_insertion_point, CaseSensitivity::CaseInsensitive)) {
+ case ConsumeNextResult::Consumed:
SWITCH_TO(DOCTYPE);
+ break;
+ case ConsumeNextResult::NotConsumed:
+ break;
+ case ConsumeNextResult::RanOutOfCharacters:
+ return {};
}
- if (consume_next_if_match("[CDATA["sv)) {
+
+ switch (consume_next_if_match("[CDATA["sv, stop_at_insertion_point)) {
+ case ConsumeNextResult::Consumed:
// We keep the parser optional so that syntax highlighting can be lexer-only.
// The parser registers itself with the lexer it creates.
if (m_parser != nullptr
@@ -444,6 +465,11 @@ _StartOfFunction:
m_current_builder.append("[CDATA["sv);
SWITCH_TO_WITH_UNCLEAN_BUILDER(BogusComment);
}
+ break;
+ case ConsumeNextResult::NotConsumed:
+ break;
+ case ConsumeNextResult::RanOutOfCharacters:
+ return {};
}
ANYTHING_ELSE
{
@@ -614,11 +640,29 @@ _StartOfFunction:
}
ANYTHING_ELSE
{
- if (to_ascii_uppercase(current_input_character.value()) == 'P' && consume_next_if_match("UBLIC"sv, CaseSensitivity::CaseInsensitive)) {
- SWITCH_TO(AfterDOCTYPEPublicKeyword);
+ if (to_ascii_uppercase(current_input_character.value()) == 'P') {
+ switch (consume_next_if_match("UBLIC"sv, stop_at_insertion_point, CaseSensitivity::CaseInsensitive)) {
+ case ConsumeNextResult::Consumed:
+ SWITCH_TO(AfterDOCTYPEPublicKeyword);
+ break;
+ case ConsumeNextResult::NotConsumed:
+ break;
+ case ConsumeNextResult::RanOutOfCharacters:
+ DONT_CONSUME_NEXT_INPUT_CHARACTER;
+ return {};
+ }
}
- if (to_ascii_uppercase(current_input_character.value()) == 'S' && consume_next_if_match("YSTEM"sv, CaseSensitivity::CaseInsensitive)) {
- SWITCH_TO(AfterDOCTYPESystemKeyword);
+ if (to_ascii_uppercase(current_input_character.value()) == 'S') {
+ switch (consume_next_if_match("YSTEM"sv, stop_at_insertion_point, CaseSensitivity::CaseInsensitive)) {
+ case ConsumeNextResult::Consumed:
+ SWITCH_TO(AfterDOCTYPESystemKeyword);
+ break;
+ case ConsumeNextResult::NotConsumed:
+ break;
+ case ConsumeNextResult::RanOutOfCharacters:
+ DONT_CONSUME_NEXT_INPUT_CHARACTER;
+ return {};
+ }
}
log_parse_error();
m_current_token.ensure_doctype_data().force_quirks = true;
@@ -1666,7 +1710,7 @@ _StartOfFunction:
m_temporary_buffer.append(ch);
if (consumed_as_part_of_an_attribute() && !match.value().entity.ends_with(';')) {
- auto next_code_point = peek_code_point(0);
+ auto next_code_point = peek_code_point(0, stop_at_insertion_point);
if (next_code_point.has_value() && (next_code_point.value() == '=' || is_ascii_alphanumeric(next_code_point.value()))) {
FLUSH_CODEPOINTS_CONSUMED_AS_A_CHARACTER_REFERENCE;
SWITCH_TO_RETURN_STATE;
@@ -2766,25 +2810,29 @@ _StartOfFunction:
}
}
-bool HTMLTokenizer::consume_next_if_match(StringView string, CaseSensitivity case_sensitivity)
+HTMLTokenizer::ConsumeNextResult HTMLTokenizer::consume_next_if_match(StringView string, StopAtInsertionPoint stop_at_insertion_point, CaseSensitivity case_sensitivity) // Skips past `string` only if all of it matches the upcoming input; the result distinguishes a mismatch from running out of input.
{
for (size_t i = 0; i < string.length(); ++i) {
- auto code_point = peek_code_point(i);
- if (!code_point.has_value())
- return false;
+ auto code_point = peek_code_point(i, stop_at_insertion_point); // Look ahead only; nothing is skipped until the whole string has matched.
+ if (!code_point.has_value()) { // No code point is available at offset i.
+ if (StopAtInsertionPoint::Yes == stop_at_insertion_point) {
+ return ConsumeNextResult::RanOutOfCharacters; // Not a mismatch: everything compared so far matched, but the lookahead came up empty.
+ }
+ return ConsumeNextResult::NotConsumed; // Not stopping at the insertion point: an exhausted lookahead counts as a plain non-match.
+ }
// FIXME: This should be more Unicode-aware.
if (case_sensitivity == CaseSensitivity::CaseInsensitive) {
if (code_point.value() < 0x80) {
if (to_ascii_lowercase(code_point.value()) != to_ascii_lowercase(string[i]))
- return false;
+ return ConsumeNextResult::NotConsumed; // ASCII case-insensitive mismatch at offset i; nothing has been skipped.
continue;
}
}
if (code_point.value() != (u32)string[i])
- return false;
+ return ConsumeNextResult::NotConsumed; // Exact mismatch at offset i; nothing has been skipped.
}
skip(string.length());
- return true;
+ return ConsumeNextResult::Consumed; // Every code point matched and has just been skipped past.
}
void HTMLTokenizer::create_new_token(HTMLToken::Type type)
diff --git a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h
index 46bd97f1894..06f235b14f1 100644
--- a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h
+++ b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.h
@@ -153,9 +153,16 @@ public:
private:
void skip(size_t count);
- Optional next_code_point();
- Optional peek_code_point(size_t offset) const;
- bool consume_next_if_match(StringView, CaseSensitivity = CaseSensitivity::CaseSensitive);
+ Optional next_code_point(StopAtInsertionPoint);
+ Optional peek_code_point(size_t offset, StopAtInsertionPoint) const;
+
+ enum class ConsumeNextResult { // Outcome of consume_next_if_match().
+ Consumed, // The whole string matched and was skipped past.
+ NotConsumed, // A code point differed, or input ran out while not stopping at the insertion point; nothing was skipped.
+ RanOutOfCharacters, // Input ran out before the whole string could be compared while stopping at the insertion point; nothing was skipped.
+ };
+ [[nodiscard]] ConsumeNextResult consume_next_if_match(StringView, StopAtInsertionPoint, CaseSensitivity = CaseSensitivity::CaseSensitive);
+
void create_new_token(HTMLToken::Type);
bool current_end_tag_token_is_appropriate() const;
String consume_current_builder();
diff --git a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/018.txt b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/018.txt
index 400fa5782b1..49a300c2d27 100644
--- a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/018.txt
+++ b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/018.txt
@@ -6,6 +6,6 @@ Rerun
Found 1 tests
-1 Fail
+1 Pass
Details
-Result Test Name MessageFail document.write
\ No newline at end of file
+Result Test Name MessagePass document.write
\ No newline at end of file
diff --git a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/034.txt b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/034.txt
index 400fa5782b1..49a300c2d27 100644
--- a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/034.txt
+++ b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/034.txt
@@ -6,6 +6,6 @@ Rerun
Found 1 tests
-1 Fail
+1 Pass
Details
-Result Test Name MessageFail document.write
\ No newline at end of file
+Result Test Name MessagePass document.write
\ No newline at end of file
diff --git a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/036.txt b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/036.txt
index 400fa5782b1..49a300c2d27 100644
--- a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/036.txt
+++ b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/036.txt
@@ -6,6 +6,6 @@ Rerun
Found 1 tests
-1 Fail
+1 Pass
Details
-Result Test Name MessageFail document.write
\ No newline at end of file
+Result Test Name MessagePass document.write
\ No newline at end of file
diff --git a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/037.txt b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/037.txt
index 400fa5782b1..49a300c2d27 100644
--- a/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/037.txt
+++ b/Tests/LibWeb/Text/expected/wpt-import/html/webappapis/dynamic-markup-insertion/document-write/037.txt
@@ -6,6 +6,6 @@ Rerun
Found 1 tests
-1 Fail
+1 Pass
Details
-Result Test Name MessageFail document.write
\ No newline at end of file
+Result Test Name MessagePass document.write
\ No newline at end of file