LibWeb: Implement tokenization newline preprocessing

Newline normalization replaces \r and \r\n with \n. The spec specifically states: "Before the tokenization stage, the input stream must be preprocessed by normalizing newlines," whereas this commit performs the normalization during tokenization itself. This should still exhibit the same behaviour, while keeping the tokenization logic in the same place.
Author: https://github.com/ant1441 Commit: d73bb2633c Pull-request: https://github.com/SerenityOS/serenity/pull/12639
2025-07-03 07:32:00 +00:00 · 2022-02-18 22:46:28 +00:00 · 2022-02-18 22:46:28 +00:00 · d73bb2633c · 2024-07-17 18:27:16 +09:00
commit d73bb2633c
parent c6fcdd0f93
1 changed files with 19 additions and 3 deletions
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
@ -187,9 +187,25 @@ Optional<u32> HTMLTokenizer::next_code_point()
 {
    if (m_utf8_iterator == m_utf8_view.end())
        return {};
-    skip(1);
-    dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", (char)*m_prev_utf8_iterator);
-    return *m_prev_utf8_iterator;
+
+    u32 code_point;
+    // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
+    // https://infra.spec.whatwg.org/#normalize-newlines
+    if (peek_code_point(0).value_or(0) == '\r' && peek_code_point(1).value_or(0) == '\n') {
+        // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
+        skip(2);
+        code_point = '\n';
+    } else if (peek_code_point(0).value_or(0) == '\r') {
+        // replace every remaining U+000D CR code point with a U+000A LF code point.
+        skip(1);
+        code_point = '\n';
+    } else {
+        skip(1);
+        code_point = *m_prev_utf8_iterator;
+    }
+
+    dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", code_point);
+    return code_point;
 }

 void HTMLTokenizer::skip(size_t count)