LibWeb/DOM: Add missing UTF-8 decode without BOM on fragment ID

We were previously crashing instead of using the replacement
character on invalid bytes.
This commit is contained in:
Shannon Booth 2025-02-10 15:32:10 +13:00 committed by Tim Ledbetter
parent 67f435975b
commit c954d0be27
Notes: github-actions[bot] 2025-02-10 09:49:08 +00:00
7 changed files with 137 additions and 2 deletions

View file

@ -2451,10 +2451,10 @@ Document::IndicatedPart Document::determine_the_indicated_part() const
// 5. Let fragmentBytes be the result of percent-decoding fragment.
// 6. Let decodedFragment be the result of running UTF-8 decode without BOM on fragmentBytes.
auto decoded_fragment = URL::percent_decode(*fragment);
auto decoded_fragment = String::from_utf8_with_replacement_character(URL::percent_decode(*fragment), String::WithBOMHandling::No);
// 7. Set potentialIndicatedElement to the result of finding a potential indicated element given document and decodedFragment.
potential_indicated_element = find_a_potential_indicated_element(MUST(FlyString::from_deprecated_fly_string(decoded_fragment)));
potential_indicated_element = find_a_potential_indicated_element(decoded_fragment);
// 8. If potentialIndicatedElement is not null, then return potentialIndicatedElement.
if (potential_indicated_element)

View file

@ -0,0 +1,8 @@
Harness status: OK
Found 3 tests
3 Pass
Pass Invalid percent-encoded UTF-8 byte should decode as U+FFFD
Pass Percent-encoded UTF-8 BOM followed by invalid UTF-8 byte should decode as U+FEFF U+FFFD
Pass Percent-encoded UTF-8 byte sequence for U+FFFD should decode as U+FFFD

View file

@ -0,0 +1,9 @@
Harness status: OK
Found 4 tests
4 Pass
Pass U+00FF should find U+00FF
Pass Percent-encoded UTF-8 BOM should find U+FEFF as BOM is not stripped when decoding
Pass %FF should not find U+00FF as decoding it gives U+FFFD
Pass Valid UTF-8 + invalid UTF-8 should not be matched to the utf8-decoded former + the isomorphic-decoded latter

View file

@ -0,0 +1,6 @@
Harness status: OK
Found 1 tests
1 Pass
Pass Fragment Navigation: fragment id should not be found in non UTF8 document

View file

@ -0,0 +1,41 @@
<!doctype html>
<meta charset=windows-1252>
<title>Fragment navigation: encoding</title>
<!--
Fragment navigation test: each subtest sets location.hash to a percent-encoded
fragment and checks, via self.scrollY, that the page scrolled down to one of
the target elements declared below. The document itself is declared as
windows-1252.
-->
<script src="../../../../resources/testharness.js"></script>
<script src="../../../../resources/testharnessreport.js"></script>
<div id="log"></div>
<!-- 10000px spacer: pushes the targets far enough down that scrolling to one of them yields scrollY > 1000. -->
<div style=height:10000px></div>
<!-- Target whose id is the single character U+FFFD. -->
<div id=&#xFFFD;></div>
<!-- Target whose id is U+FEFF followed by U+FFFD. -->
<div id=&#xFEFF;&#xFFFD;></div>
<script>
// Resets the scroll position between subtests by navigating to the "top"
// fragment, then asserts that the page is back at scrollY == 0.
function goToTop() {
location.hash = "top";
assert_equals(self.scrollY, 0, "#top");
}
// First subtest: also checks the precondition that the page was loaded without
// a fragment, so it does not call goToTop() beforehand.
test(() => {
assert_equals(location.hash, "", "Page must be loaded with no hash");
location.hash = "%C2";
// The hash is reported back unchanged (still percent-encoded).
assert_equals(location.hash, "#%C2");
// Scrolling past 1000px means one of the targets below the spacer was found.
assert_greater_than(self.scrollY, 1000, "#%C2");
}, "Invalid percent-encoded UTF-8 byte should decode as U+FFFD");
test(() => {
goToTop();
// %EF%BB%BF followed by %C2; per the test name this should decode as U+FEFF U+FFFD.
location.hash = "%EF%BB%BF%C2";
assert_equals(location.hash, "#%EF%BB%BF%C2");
assert_greater_than(self.scrollY, 1000, "#%EF%BB%BF%C2");
}, "Percent-encoded UTF-8 BOM followed by invalid UTF-8 byte should decode as U+FEFF U+FFFD");
test(() => {
goToTop();
// %EF%BF%BD: per the test name this is the UTF-8 byte sequence for U+FFFD itself.
location.hash = "%EF%BF%BD";
assert_equals(location.hash, "#%EF%BF%BD");
assert_greater_than(self.scrollY, 1000, "#%EF%BF%BD");
// Leave the page scrolled back to the top once the last subtest is done.
goToTop();
}, "Percent-encoded UTF-8 byte sequence for U+FFFD should decode as U+FFFD");
</script>

View file

@ -0,0 +1,50 @@
<!doctype html>
<meta charset=windows-1252>
<title>Fragment navigation: encoding</title>
<!--
Fragment navigation test: each subtest sets location.hash and checks, via
self.scrollY, whether the page did or did not scroll to one of the target
elements declared below. The document itself is declared as windows-1252.
-->
<script src="../../../../resources/testharness.js"></script>
<script src="../../../../resources/testharnessreport.js"></script>
<div id="log"></div>
<!-- 10000px spacer: pushes the targets far enough down that scrolling to one of them yields scrollY > 1000. -->
<div style=height:10000px></div>
<!-- Target whose id is U+00FF. -->
<div id=&#xFF;></div>
<!-- Target whose id is U+FEFF. -->
<div id=&#xFEFF;></div>
<!-- Target whose id is U+2661 U+00FF. The element is closed explicitly so the script below is not nested inside it. -->
<div id=&#x2661;&#x00FF;></div>
<script>
// Resets the scroll position between subtests by navigating to the "top"
// fragment, then asserts that the page is back at scrollY == 0.
function goToTop() {
  location.hash = "top";
  assert_equals(self.scrollY, 0, "#top");
}

// First subtest: also checks the precondition that the page was loaded without
// a fragment, so it does not call goToTop() beforehand.
test(() => {
  assert_equals(location.hash, "", "Page must be loaded with no hash");
  location.hash = "\u00FF";
  // The non-ASCII character is reported back percent-encoded as %C3%BF.
  assert_equals(location.hash, "#%C3%BF");
  assert_greater_than(self.scrollY, 1000, "#%C3%BF");
}, "U+00FF should find U+00FF");

test(() => {
  goToTop();
  location.hash = "%EF%BB%BF";
  // Scrolling past 1000px means a target below the spacer was found.
  assert_greater_than(self.scrollY, 1000, "#%EF%BB%BF");
}, "Percent-encoded UTF-8 BOM should find U+FEFF as BOM is not stripped when decoding");

test(() => {
  goToTop();
  location.hash = "%FF";
  // No scroll expected: the page must stay at the top.
  assert_equals(self.scrollY, 0, "#%FF");
}, "%FF should not find U+00FF as decoding it gives U+FFFD");

test(() => {
  goToTop();
  // U+2661 in UTF-8 + %FF.
  // Chrome had an issue that the following fragment was decoded as U+2661 U+00FF.
  // https://github.com/whatwg/html/pull/3111
  location.hash = "%E2%99%A1%FF";
  assert_equals(self.scrollY, 0, "%E2%99%A1%FF");
  // Leave the page scrolled back to the top once the last subtest is done.
  goToTop();
}, "Valid UTF-8 + invalid UTF-8 should not be matched to the utf8-decoded former + the isomorphic-decoded latter");
</script>

View file

@ -0,0 +1,21 @@
<!doctype html>
<title>Fragment Navigation: fragment id should not be found in non UTF8 document</title>
<meta name=timeout content=long>
<meta http-equiv="Content-Type" content="text/html; charset=gbk"/>
<!--
Fragment navigation test for a document declared as GBK: setting the hash to
'%89g' must not scroll the page to the element whose id is U+586F.
-->
<script src="../../../../resources/testharness.js"></script>
<script src="../../../../resources/testharnessreport.js"></script>
<body>
<div></div>
<!-- Target element, positioned 100px from the top; its id is the single character U+586F. -->
<div id="&#x586f;" style="position:absolute; top:100px;"></div>
<!-- Spacer twice the viewport height, so the document is tall enough to scroll. -->
<div style="height:200vh;"></div>
<script>
async_test(test => {
  assert_equals(document.characterSet, "GBK", "Document should be GBK encoded");
  assert_equals(location.hash, "", "Page must be loaded with no hash");
  // '%89g' is the byte pair 0x89 0x67 (presumably the GBK encoding of U+586F; verify
  // against a GBK table). The test expects that it does not match the target.
  location.hash = '%89g';
  // Check the scroll position from a timer callback rather than synchronously.
  test.step_timeout(() => {
    assert_equals( document.scrollingElement.scrollTop, 0 );
    test.done();
  }, 1);
});
</script>