LibWeb/DOM: Add missing UTF-8 decode without BOM on fragment ID

We were previously crashing instead of using the replacement character on invalid bytes.
Author: https://github.com/shannonbooth Commit: https://github.com/LadybirdBrowser/ladybird/commit/c954d0be274 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/3525 Reviewed-by: https://github.com/tcl3 ✅
2025-04-20 19:45:12 +00:00 · 2025-02-10 15:32:10 +13:00 · 2025-02-10 15:32:10 +13:00 · c954d0be27 · 2025-02-10 09:49:08 +00:00
commit c954d0be27
parent 67f435975b
7 changed files with 137 additions and 2 deletions
--- a/Libraries/LibWeb/DOM/Document.cpp
+++ b/Libraries/LibWeb/DOM/Document.cpp
@@ -2451,10 +2451,10 @@ Document::IndicatedPart Document::determine_the_indicated_part() const

    // 5. Let fragmentBytes be the result of percent-decoding fragment.
    // 6. Let decodedFragment be the result of running UTF-8 decode without BOM on fragmentBytes.
-    auto decoded_fragment = URL::percent_decode(*fragment);
+    auto decoded_fragment = String::from_utf8_with_replacement_character(URL::percent_decode(*fragment), String::WithBOMHandling::No);

    // 7. Set potentialIndicatedElement to the result of finding a potential indicated element given document and decodedFragment.
-    potential_indicated_element = find_a_potential_indicated_element(MUST(FlyString::from_deprecated_fly_string(decoded_fragment)));
+    potential_indicated_element = find_a_potential_indicated_element(decoded_fragment);

    // 8. If potentialIndicatedElement is not null, then return potentialIndicatedElement.
    if (potential_indicated_element)
--- a/Tests/LibWeb/Text/expected/wpt-import/html/browsers/browsing-the-web/scroll-to-fragid/fragment-and-encoding-2.txt
+++ b/Tests/LibWeb/Text/expected/wpt-import/html/browsers/browsing-the-web/scroll-to-fragid/fragment-and-encoding-2.txt
@@ -0,0 +1,8 @@
+Harness status: OK
+
+Found 3 tests
+
+3 Pass
+Pass	Invalid percent-encoded UTF-8 byte should decode as U+FFFD
+Pass	Percent-encoded UTF-8 BOM followed by invalid UTF-8 byte should decode as U+FEFF U+FFFD
+Pass	Percent-encoded UTF-8 byte sequence for U+FFFD should decode as U+FFFD
--- a/Tests/LibWeb/Text/expected/wpt-import/html/browsers/browsing-the-web/scroll-to-fragid/fragment-and-encoding.txt
+++ b/Tests/LibWeb/Text/expected/wpt-import/html/browsers/browsing-the-web/scroll-to-fragid/fragment-and-encoding.txt
@@ -0,0 +1,9 @@
+Harness status: OK
+
+Found 4 tests
+
+4 Pass
+Pass	U+00FF should find U+00FF
+Pass	Percent-encoded UTF-8 BOM should find U+FEFF as BOM is not stripped when decoding
+Pass	%FF should not find U+00FF as decoding it gives U+FFFD
+Pass	Valid UTF-8 + invalid UTF-8 should not be matched to the utf8-decoded former + the isomorphic-decoded latter
--- a/Tests/LibWeb/Text/expected/wpt-import/html/browsers/browsing-the-web/scroll-to-fragid/scroll-frag-non-utf8-encoded-document.txt
+++ b/Tests/LibWeb/Text/expected/wpt-import/html/browsers/browsing-the-web/scroll-to-fragid/scroll-frag-non-utf8-encoded-document.txt
@@ -0,0 +1,6 @@
+Harness status: OK
+
+Found 1 tests
+
+1 Pass
+Pass	Fragment Navigation: fragment id should not be found in non UTF8 document
--- a/Tests/LibWeb/Text/input/wpt-import/html/browsers/browsing-the-web/scroll-to-fragid/fragment-and-encoding-2.html
+++ b/Tests/LibWeb/Text/input/wpt-import/html/browsers/browsing-the-web/scroll-to-fragid/fragment-and-encoding-2.html
@@ -0,0 +1,41 @@
+<!doctype html>
+<meta charset=windows-1252>
+<title>Fragment navigation: encoding</title>
+<script src="../../../../resources/testharness.js"></script>
+<script src="../../../../resources/testharnessreport.js"></script>
+<div id="log"></div>
+<div style=height:10000px></div>
+<div id=&#xFFFD;></div>
+<div id=&#xFEFF;&#xFFFD;></div>
+<script>
+function goToTop() {
+  location.hash = "top";
+  assert_equals(self.scrollY, 0, "#top");
+}
+
+test(() => {
+  assert_equals(location.hash, "", "Page must be loaded with no hash");
+
+  location.hash = "%C2";
+  assert_equals(location.hash, "#%C2");
+  assert_greater_than(self.scrollY, 1000, "#%C2");
+}, "Invalid percent-encoded UTF-8 byte should decode as U+FFFD");
+
+test(() => {
+  goToTop();
+
+  location.hash = "%EF%BB%BF%C2";
+  assert_equals(location.hash, "#%EF%BB%BF%C2");
+  assert_greater_than(self.scrollY, 1000, "#%EF%BB%BF%C2");
+}, "Percent-encoded UTF-8 BOM followed by invalid UTF-8 byte should decode as U+FEFF U+FFFD");
+
+test(() => {
+  goToTop();
+
+  location.hash = "%EF%BF%BD";
+  assert_equals(location.hash, "#%EF%BF%BD");
+  assert_greater_than(self.scrollY, 1000, "#%EF%BF%BD");
+
+  goToTop();
+}, "Percent-encoded UTF-8 byte sequence for U+FFFD should decode as U+FFFD");
+</script>
--- a/Tests/LibWeb/Text/input/wpt-import/html/browsers/browsing-the-web/scroll-to-fragid/fragment-and-encoding.html
+++ b/Tests/LibWeb/Text/input/wpt-import/html/browsers/browsing-the-web/scroll-to-fragid/fragment-and-encoding.html
@@ -0,0 +1,50 @@
+<!doctype html>
+<meta charset=windows-1252>
+<title>Fragment navigation: encoding</title>
+<script src="../../../../resources/testharness.js"></script>
+<script src="../../../../resources/testharnessreport.js"></script>
+<div id="log"></div>
+<div style=height:10000px></div>
+<div id=&#xFF;></div>
+<div id=&#xFEFF;></div>
+<div id=&#x2661;&#x00FF;><div>
+<script>
+function goToTop() {
+  location.hash = "top";
+  assert_equals(self.scrollY, 0, "#top");
+}
+
+test(() => {
+  assert_equals(location.hash, "", "Page must be loaded with no hash");
+
+  location.hash = "\u00FF";
+  assert_equals(location.hash, "#%C3%BF");
+  assert_greater_than(self.scrollY, 1000, "#%C3%BF");
+}, "U+00FF should find U+00FF");
+
+test(() => {
+  goToTop();
+
+  location.hash = "%EF%BB%BF";
+  assert_greater_than(self.scrollY, 1000, "#%EF%BB%BF");
+}, "Percent-encoded UTF-8 BOM should find U+FEFF as BOM is not stripped when decoding");
+
+test(() => {
+  goToTop();
+
+  location.hash = "%FF";
+  assert_equals(self.scrollY, 0, "#%FF");
+}, "%FF should not find U+00FF as decoding it gives U+FFFD");
+
+test(() => {
+  goToTop();
+
+  // U+2661 in UTF-8 + %FF.
+  // Chrome had an issue that the following fragment was decoded as U+2661 U+00FF.
+  // https://github.com/whatwg/html/pull/3111
+  location.hash = "%E2%99%A1%FF";
+  assert_equals(self.scrollY, 0, "%E2%99%A1%FF");
+
+  goToTop();
+}, "Valid UTF-8 + invalid UTF-8 should not be matched to the utf8-decoded former + the isomorphic-decoded latter");
+</script>
--- a/Tests/LibWeb/Text/input/wpt-import/html/browsers/browsing-the-web/scroll-to-fragid/scroll-frag-non-utf8-encoded-document.html
+++ b/Tests/LibWeb/Text/input/wpt-import/html/browsers/browsing-the-web/scroll-to-fragid/scroll-frag-non-utf8-encoded-document.html
@@ -0,0 +1,21 @@
+<!doctype html>
+<title>Fragment Navigation: fragment id should not be found in non UTF8 document</title>
+<meta name=timeout content=long>
+<meta http-equiv="Content-Type" content="text/html; charset=gbk"/>
+<script src="../../../../resources/testharness.js"></script>
+<script src="../../../../resources/testharnessreport.js"></script>
+<body>
+<div></div>
+<div id="&#x586f" style="position:absolute; top:100px;"></div>
+<div style="height:200vh;"></div>
+<script>
+async_test(test => {
+  assert_equals(document.characterSet, "GBK", "Document should be GBK encoded");
+  assert_equals(location.hash, "", "Page must be loaded with no hash");
+  location.hash = '%89g';
+  test.step_timeout(() => {
+    assert_equals( document.scrollingElement.scrollTop, 0 );
+    test.done();
+  }, 1);
+});
+</script>