LibWeb: Remember when HTML parser should ignore next line feed character

There's a quirk in HTML where the parser should ignore a line feed character that immediately follows a `pre` or `textarea` start tag. This worked fine when we could peek ahead in the input stream and see the next token, but it didn't work when parsing one character at a time via document.write(), where the next token isn't available yet. This commit turns "can ignore next line feed character" into a parser flag that persists across parser invocations, so the quirk works in this parsing mode as well. 20 new passes in WPT/html/syntax/parsing/ :^)
Author: https://github.com/awesomekling Commit: 550613e526 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/3626 Reviewed-by: https://github.com/shannonbooth
2025-06-18 08:11:53 +00:00 · 2025-02-19 16:54:28 +01:00 · 2025-02-19 16:54:28 +01:00 · 550613e526 · 2025-02-20 13:33:19 +00:00
commit 550613e526
parent 611833429a
11 changed files with 51 additions and 61 deletions
--- a/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp
+++ b/Libraries/LibWeb/HTML/Parser/HTMLParser.cpp
@@ -199,6 +199,13 @@ void HTMLParser::run(HTMLTokenizer::StopAtInsertionPoint stop_at_insertion_point

        dbgln_if(HTML_PARSER_DEBUG, "[{}] {}", insertion_mode_name(), token.to_string());

+        if (m_next_line_feed_can_be_ignored) {
+            m_next_line_feed_can_be_ignored = false;
+            if (token.is_character() && token.code_point() == '\n') {
+                continue;
+            }
+        }
+
        // https://html.spec.whatwg.org/multipage/parsing.html#tree-construction-dispatcher
        // As each token is emitted from the tokenizer, the user agent must follow the appropriate steps from the following list, known as the tree construction dispatcher:
        if (m_stack_of_open_elements.is_empty()
@@ -2017,20 +2024,13 @@ void HTMLParser::handle_in_body(HTMLToken& token)
        // Insert an HTML element for the token.
        (void)insert_html_element(token);

-        // AD-HOC: We move this step before handling LINE FEED below, to ensure the flag is updated before
-        //         we process the next token. This is necessary due to how we implement token reprocessing.
-        // Set the frameset-ok flag to "not ok".
-        m_frameset_ok = false;
-
        // If the next token is a U+000A LINE FEED (LF) character token,
        // then ignore that token and move on to the next one.
        // (Newlines at the start of pre blocks are ignored as an authoring convenience.)
-        auto next_token = m_tokenizer.next_token();
-        if (next_token.has_value() && next_token.value().is_character() && next_token.value().code_point() == '\n') {
-            // Ignore it.
-        } else if (next_token.has_value()) {
-            process_using_the_rules_for(m_insertion_mode, next_token.value());
-        }
+        m_next_line_feed_can_be_ignored = true;
+
+        // Set the frameset-ok flag to "not ok".
+        m_frameset_ok = false;

        return;
    }
@@ -2567,16 +2567,14 @@ void HTMLParser::handle_in_body(HTMLToken& token)
        // 1. Insert an HTML element for the token.
        (void)insert_html_element(token);

-        // FIXME: 2. If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.)
+        // 2. If the next token is a U+000A LINE FEED (LF) character token,
+        //    then ignore that token and move on to the next one.
+        //    (Newlines at the start of textarea elements are ignored as an authoring convenience.)
+        m_next_line_feed_can_be_ignored = true;

        // 3. Switch the tokenizer to the RCDATA state.
        m_tokenizer.switch_to({}, HTMLTokenizer::State::RCDATA);

-        // If the next token is a U+000A LINE FEED (LF) character token,
-        // then ignore that token and move on to the next one.
-        // (Newlines at the start of pre blocks are ignored as an authoring convenience.)
-        auto next_token = m_tokenizer.next_token();
-
        // 4. Let the original insertion mode be the current insertion mode.
        m_original_insertion_mode = m_insertion_mode;

@@ -2585,13 +2583,6 @@ void HTMLParser::handle_in_body(HTMLToken& token)

        // 6. Switch the insertion mode to "text".
        m_insertion_mode = InsertionMode::Text;
-
-        // FIXME: This step is not in the spec.
-        if (next_token.has_value() && next_token.value().is_character() && next_token.value().code_point() == '\n') {
-            // Ignore it.
-        } else if (next_token.has_value()) {
-            process_using_the_rules_for(m_insertion_mode, next_token.value());
-        }
        return;
    }