mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-04-27 06:48:49 +00:00
LibWeb: Remember when HTML parser should ignore next line feed character
There's a quirk in HTML where the parser should ignore a line feed character that immediately follows a `pre` or `textarea` start tag. This worked fine when we could peek ahead in the input stream and see the next token, but it didn't work in character-at-a-time parsing with document.write(). This commit adds a "can ignore next line feed character" parser flag that is maintained across invocations, making the quirk work in this parsing mode as well. 20 new passes in WPT/html/syntax/parsing/ :^)
This commit is contained in:
parent
611833429a
commit
550613e526
Notes:
github-actions[bot]
2025-02-20 13:33:19 +00:00
Author: https://github.com/awesomekling
Commit: 550613e526
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/3626
Reviewed-by: https://github.com/shannonbooth
11 changed files with 51 additions and 61 deletions
|
@ -199,6 +199,13 @@ void HTMLParser::run(HTMLTokenizer::StopAtInsertionPoint stop_at_insertion_point
|
|||
|
||||
dbgln_if(HTML_PARSER_DEBUG, "[{}] {}", insertion_mode_name(), token.to_string());
|
||||
|
||||
if (m_next_line_feed_can_be_ignored) {
|
||||
m_next_line_feed_can_be_ignored = false;
|
||||
if (token.is_character() && token.code_point() == '\n') {
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// https://html.spec.whatwg.org/multipage/parsing.html#tree-construction-dispatcher
|
||||
// As each token is emitted from the tokenizer, the user agent must follow the appropriate steps from the following list, known as the tree construction dispatcher:
|
||||
if (m_stack_of_open_elements.is_empty()
|
||||
|
@ -2017,20 +2024,13 @@ void HTMLParser::handle_in_body(HTMLToken& token)
|
|||
// Insert an HTML element for the token.
|
||||
(void)insert_html_element(token);
|
||||
|
||||
// AD-HOC: We move this step before handling LINE FEED below, to ensure the flag is updated before
|
||||
// we process the next token. This is necessary due to how we implement token reprocessing.
|
||||
// Set the frameset-ok flag to "not ok".
|
||||
m_frameset_ok = false;
|
||||
|
||||
// If the next token is a U+000A LINE FEED (LF) character token,
|
||||
// then ignore that token and move on to the next one.
|
||||
// (Newlines at the start of pre blocks are ignored as an authoring convenience.)
|
||||
auto next_token = m_tokenizer.next_token();
|
||||
if (next_token.has_value() && next_token.value().is_character() && next_token.value().code_point() == '\n') {
|
||||
// Ignore it.
|
||||
} else if (next_token.has_value()) {
|
||||
process_using_the_rules_for(m_insertion_mode, next_token.value());
|
||||
}
|
||||
m_next_line_feed_can_be_ignored = true;
|
||||
|
||||
// Set the frameset-ok flag to "not ok".
|
||||
m_frameset_ok = false;
|
||||
|
||||
return;
|
||||
}
|
||||
|
@ -2567,16 +2567,14 @@ void HTMLParser::handle_in_body(HTMLToken& token)
|
|||
// 1. Insert an HTML element for the token.
|
||||
(void)insert_html_element(token);
|
||||
|
||||
// FIXME: 2. If the next token is a U+000A LINE FEED (LF) character token, then ignore that token and move on to the next one. (Newlines at the start of textarea elements are ignored as an authoring convenience.)
|
||||
// 2. If the next token is a U+000A LINE FEED (LF) character token,
|
||||
// then ignore that token and move on to the next one.
|
||||
// (Newlines at the start of textarea elements are ignored as an authoring convenience.)
|
||||
m_next_line_feed_can_be_ignored = true;
|
||||
|
||||
// 3. Switch the tokenizer to the RCDATA state.
|
||||
m_tokenizer.switch_to({}, HTMLTokenizer::State::RCDATA);
|
||||
|
||||
// If the next token is a U+000A LINE FEED (LF) character token,
|
||||
// then ignore that token and move on to the next one.
|
||||
// (Newlines at the start of pre blocks are ignored as an authoring convenience.)
|
||||
auto next_token = m_tokenizer.next_token();
|
||||
|
||||
// 4. Let the original insertion mode be the current insertion mode.
|
||||
m_original_insertion_mode = m_insertion_mode;
|
||||
|
||||
|
@ -2585,13 +2583,6 @@ void HTMLParser::handle_in_body(HTMLToken& token)
|
|||
|
||||
// 6. Switch the insertion mode to "text".
|
||||
m_insertion_mode = InsertionMode::Text;
|
||||
|
||||
// FIXME: This step is not in the spec.
|
||||
if (next_token.has_value() && next_token.value().is_character() && next_token.value().code_point() == '\n') {
|
||||
// Ignore it.
|
||||
} else if (next_token.has_value()) {
|
||||
process_using_the_rules_for(m_insertion_mode, next_token.value());
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue