From 7aa0165fe75bb94b79bc847484c7913d249ce263 Mon Sep 17 00:00:00 2001
From: Andrew Kaster <andrew@ladybird.org>
Date: Mon, 30 Sep 2024 17:52:30 -0600
Subject: [PATCH] LibWeb: Deduplicate attributes when emitting start and end
 tags

The HTML tokenizer specification says that we're supposed to do this
when leaving the attribute name state, or before emitting the tag
token, as appropriate.

Hopefully 'as appropriate' can mean only when emitting the token, as
that's the easiest place to insert this logic without complicating the
tokenizer any more.
---
 .../html-parser-duplicate-attributes.txt      |  8 +++++
 .../html-parser-duplicate-attributes.html     | 21 ++++++++++++++
 .../LibWeb/HTML/Parser/HTMLToken.cpp          | 29 +++++++++++++++++++
 .../Libraries/LibWeb/HTML/Parser/HTMLToken.h  |  2 ++
 .../LibWeb/HTML/Parser/HTMLTokenizer.cpp      |  3 ++
 5 files changed, 63 insertions(+)
 create mode 100644 Tests/LibWeb/Text/expected/html-parser-duplicate-attributes.txt
 create mode 100644 Tests/LibWeb/Text/input/html-parser-duplicate-attributes.html
diff --git a/Tests/LibWeb/Text/expected/html-parser-duplicate-attributes.txt b/Tests/LibWeb/Text/expected/html-parser-duplicate-attributes.txt
new file mode 100644
index 00000000000..c3a44f872af
--- /dev/null
+++ b/Tests/LibWeb/Text/expected/html-parser-duplicate-attributes.txt
@@ -0,0 +1,8 @@
+    divs[0].id: fred
+divs[0].className: math
+divs[1].id: spaghetti
+divs[1].className: 
+divs[2].getAttribute("grape"): foo
+divs[0].numAttributes: 2
+divs[1].numAttributes: 2
+divs[2].numAttributes: 1
diff --git a/Tests/LibWeb/Text/input/html-parser-duplicate-attributes.html b/Tests/LibWeb/Text/input/html-parser-duplicate-attributes.html
new file mode 100644
index 00000000000..59f7ce04740
--- /dev/null
+++ b/Tests/LibWeb/Text/input/html-parser-duplicate-attributes.html
@@ -0,0 +1,21 @@
+<!DOCTYPE html>
+<script src="include.js"></script>
+<div id="fred"id="barney" class="math"></div>
+<div class class=1"foo" id="spaghetti" id></div>
+<div grape="foo" grape grape="bar" grape grape grape=baz></div>
+<script>
+    test(() => {
+        let divs = document.getElementsByTagName("div");
+
+        // Per the HTML spec, the first occurrence of a duplicated attribute wins; later ones are dropped.
+        println(`divs[0].id: ${divs[0].id}`);
+        println(`divs[0].className: ${divs[0].className}`);
+        println(`divs[1].id: ${divs[1].id}`);
+        println(`divs[1].className: ${divs[1].className}`);
+        println(`divs[2].getAttribute("grape"): ${divs[2].getAttribute("grape")}`);
+
+        println(`divs[0].numAttributes: ${divs[0].attributes.length}`); // 2 (id, class)
+        println(`divs[1].numAttributes: ${divs[1].attributes.length}`); // 2 (class, id)
+        println(`divs[2].numAttributes: ${divs[2].attributes.length}`); // 1 (grape)
+    });
+</script>
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp
index 27e256d9775..aaed432a741 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.cpp
@@ -4,6 +4,7 @@
  * SPDX-License-Identifier: BSD-2-Clause
  */
 
+#include <AK/HashTable.h>
 #include <LibWeb/HTML/Parser/HTMLToken.h>
 
 namespace Web::HTML {
@@ -73,4 +74,32 @@ String HTMLToken::to_string() const
     return MUST(builder.to_string());
 }
 
+void HTMLToken::normalize_attributes()
+{
+    // Spec reference (attribute name state): https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
+    //
+    // On leaving the attribute name state (and before emitting the tag token, if appropriate), the complete
+    // attribute name must be compared against the other attributes on the same token. If the token already has
+    // an attribute with the exact same name, this is a duplicate-attribute parse error and the new attribute
+    // must be removed from the token.
+
+    // NOTE (from the spec): an attribute removed this way, and any value that gets associated with it, is never
+    // subsequently used by the parser and is effectively discarded. The removal does not change its status as
+    // the "current attribute" for the purposes of the tokenizer, however.
+
+    auto* attributes = tag_attributes();
+    if (!attributes)
+        return;
+    // Walk the attributes in order; only the first attribute carrying a given name survives.
+    HashTable<FlyString> names_seen;
+    size_t index = 0;
+    while (index < attributes->size()) {
+        auto result = names_seen.set(attributes->at(index).local_name, AK::HashSetExistingEntryBehavior::Keep);
+        if (result == AK::HashSetResult::KeptExistingEntry)
+            attributes->remove(index); // Duplicate: drop it. The next attribute now occupies this index.
+        else
+            ++index;
+    }
+}
+
 }
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h
index 07341831b24..8e8e74849bb 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLToken.h
@@ -328,6 +328,8 @@ public:
     void set_start_position(Badge<HTMLTokenizer>, Position start_position) { m_start_position = start_position; }
     void set_end_position(Badge<HTMLTokenizer>, Position end_position) { m_end_position = end_position; }
 
+    void normalize_attributes(); // Removes duplicate attributes from this token, keeping the first occurrence of each name.
+
 private:
     Vector<Attribute> const* tag_attributes() const
     {
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
index 934abce3ab7..de3dc9c1d6e 100644
--- a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp
@@ -2863,6 +2863,9 @@ void HTMLTokenizer::will_emit(HTMLToken& token)
 
     auto is_start_or_end_tag = token.type() == HTMLToken::Type::StartTag || token.type() == HTMLToken::Type::EndTag;
     token.set_end_position({}, nth_last_position(is_start_or_end_tag ? 1 : 0));
+
+    if (is_start_or_end_tag)
+        token.normalize_attributes();
 }
 
 bool HTMLTokenizer::current_end_tag_token_is_appropriate() const