LibWeb: Implement tokenization newline preprocessing

Newline normalization will replace \r and \r\n with \n.

The spec specifically states
> Before the tokenization stage, the input stream must be preprocessed
> by normalizing newlines.
whereas this implementation performs the normalization during
tokenization itself.

This should still exhibit the same behaviour, while keeping the
tokenization logic in the same place.
This commit is contained in:
Adam Hodgen 2022-02-18 22:46:28 +00:00 committed by Andreas Kling
parent c6fcdd0f93
commit d73bb2633c
Notes: sideshowbarker 2024-07-17 18:27:16 +09:00

View file

@ -187,9 +187,25 @@ Optional<u32> HTMLTokenizer::next_code_point()
{
// Consumes and returns the next code point of the input, or an empty
// Optional once the iterator has reached the end of the UTF-8 view.
// In the newline-normalizing body below, "\r\n" and a lone "\r" are both
// reported to the caller as a single '\n'.
//
// NOTE(review): this text is a diff hunk whose +/- markers were lost. The
// three statements right after the end-of-input check look like the
// pre-change body (removed lines), and everything from `u32 code_point;`
// onwards looks like the replacement. Read literally, the first `return`
// would make the rest unreachable. Confirm against the real file.
if (m_utf8_iterator == m_utf8_view.end())
return {};
// Presumably the old behaviour: consume one code point and return it as-is.
skip(1);
dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", (char)*m_prev_utf8_iterator);
return *m_prev_utf8_iterator;
// Presumably the new behaviour: normalize newlines while consuming.
u32 code_point;
// https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
// https://infra.spec.whatwg.org/#normalize-newlines
// value_or(0) substitutes 0 when a peek yields no value, so a missing
// code point compares unequal to both '\r' and '\n'.
if (peek_code_point(0).value_or(0) == '\r' && peek_code_point(1).value_or(0) == '\n') {
// replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
// Both code points of the pair are consumed; only '\n' is reported.
skip(2);
code_point = '\n';
} else if (peek_code_point(0).value_or(0) == '\r') {
// replace every remaining U+000D CR code point with a U+000A LF code point.
skip(1);
code_point = '\n';
} else {
// Any other code point is passed through unchanged.
// NOTE(review): assumes skip(1) leaves m_prev_utf8_iterator on the code
// point that was just consumed — confirm in skip().
skip(1);
code_point = *m_prev_utf8_iterator;
}
// The trace receives the (possibly normalized) u32 value; the earlier
// trace line above cast the code point to char instead.
dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", code_point);
return code_point;
}
void HTMLTokenizer::skip(size_t count)