mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-05-13 06:32:54 +00:00
LibWeb: Implement tokenization newline preprocessing
Newline normalization replaces \r\n and \r with \n. The spec specifically states: "Before the tokenization stage, the input stream must be preprocessed by normalizing newlines", whereas this commit performs the normalization during tokenization itself. This should still exhibit the same behaviour, while keeping the tokenization logic in the same place.
This commit is contained in:
parent
c6fcdd0f93
commit
d73bb2633c
Notes:
sideshowbarker
2024-07-17 18:27:16 +09:00
Author: https://github.com/ant1441
Commit: d73bb2633c
Pull-request: https://github.com/SerenityOS/serenity/pull/12639
1 changed files with 19 additions and 3 deletions
|
@ -187,9 +187,25 @@ Optional<u32> HTMLTokenizer::next_code_point()
|
|||
{
|
||||
if (m_utf8_iterator == m_utf8_view.end())
|
||||
return {};
|
||||
skip(1);
|
||||
dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", (char)*m_prev_utf8_iterator);
|
||||
return *m_prev_utf8_iterator;
|
||||
|
||||
u32 code_point;
|
||||
// https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
|
||||
// https://infra.spec.whatwg.org/#normalize-newlines
|
||||
if (peek_code_point(0).value_or(0) == '\r' && peek_code_point(1).value_or(0) == '\n') {
|
||||
// replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
|
||||
skip(2);
|
||||
code_point = '\n';
|
||||
} else if (peek_code_point(0).value_or(0) == '\r') {
|
||||
// replace every remaining U+000D CR code point with a U+000A LF code point.
|
||||
skip(1);
|
||||
code_point = '\n';
|
||||
} else {
|
||||
skip(1);
|
||||
code_point = *m_prev_utf8_iterator;
|
||||
}
|
||||
|
||||
dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", code_point);
|
||||
return code_point;
|
||||
}
|
||||
|
||||
void HTMLTokenizer::skip(size_t count)
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue