From 15741350baddf3a5b510eee167a51a8f3b3a7093 Mon Sep 17 00:00:00 2001
From: Milo van der Tier <milo@vdtier.nl>
Date: Sat, 23 Nov 2024 14:46:42 +0100
Subject: [PATCH] LibWeb: Make replaceData create new surrogate pairs

When inserting a new UTF-16 surrogate next to an existing surrogate
with replaceData, the surrogates would not get merged correctly into a
single code point. This is because internally the text data is stored
as UTF-8, and the two surrogates would be converted separately. This
has now been fixed by first recreating the whole string in UTF-16 and
then converting it back to UTF-8.

It's not the most efficient solution, but this fixes at least 6 WPT
subtests.
---
 Libraries/LibWeb/DOM/CharacterData.cpp        | 21 +++---
 .../dom/nodes/CharacterData-surrogates.txt    | 18 +++++
 .../dom/nodes/CharacterData-surrogates.html   | 74 +++++++++++++++++++
 3 files changed, 104 insertions(+), 9 deletions(-)
 create mode 100644 Tests/LibWeb/Text/expected/wpt-import/dom/nodes/CharacterData-surrogates.txt
 create mode 100644 Tests/LibWeb/Text/input/wpt-import/dom/nodes/CharacterData-surrogates.html
diff --git a/Libraries/LibWeb/DOM/CharacterData.cpp b/Libraries/LibWeb/DOM/CharacterData.cpp
index 7fd76e43fe0..66d2a91c04a 100644
--- a/Libraries/LibWeb/DOM/CharacterData.cpp
+++ b/Libraries/LibWeb/DOM/CharacterData.cpp
@@ -72,8 +72,6 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
     Utf16View utf16_view { utf16_data };
     auto length = utf16_view.length_in_code_units();
 
-    auto inserted_data_length_in_utf16_code_units = AK::utf16_code_unit_length_from_utf8(data);
-
     // 2. If offset is greater than length, then throw an "IndexSizeError" DOMException.
     if (offset > length)
         return WebIDL::IndexSizeError::create(realm(), "Replacement offset out of range."_string);
@@ -88,11 +86,16 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
     // 5. Insert data into node’s data after offset code units.
     // 6. Let delete offset be offset + data’s length.
     // 7. Starting from delete offset code units, remove count code units from node’s data.
-    StringBuilder builder;
-    builder.append(MUST(utf16_view.substring_view(0, offset).to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)));
-    builder.append(data);
-    builder.append(MUST(utf16_view.substring_view(offset + count).to_utf8(Utf16View::AllowInvalidCodeUnits::Yes)));
-    m_data = MUST(builder.to_string());
+    auto before_data = utf16_view.substring_view(0, offset);
+    auto inserted_data = MUST(AK::utf8_to_utf16(data));
+    auto after_data = utf16_view.substring_view(offset + count);
+    Utf16Data full_data;
+    full_data.ensure_capacity(before_data.length_in_code_units() + inserted_data.size() + after_data.length_in_code_units());
+    full_data.append(before_data.data(), before_data.length_in_code_units());
+    full_data.extend(inserted_data);
+    full_data.append(after_data.data(), after_data.length_in_code_units());
+    Utf16View full_view { full_data };
+    m_data = MUST(full_view.to_utf8(Utf16View::AllowInvalidCodeUnits::Yes));
 
     // 8. For each live range whose start node is node and start offset is greater than offset but less than or equal to offset plus count, set its start offset to offset.
     for (auto& range : Range::live_ranges()) {
@@ -109,14 +112,14 @@ WebIDL::ExceptionOr<void> CharacterData::replace_data(size_t offset, size_t coun
     // 10. For each live range whose start node is node and start offset is greater than offset plus count, increase its start offset by data’s length and decrease it by count.
     for (auto& range : Range::live_ranges()) {
         if (range->start_container() == this && range->start_offset() > (offset + count))
-            TRY(range->set_start(*range->start_container(), range->start_offset() + inserted_data_length_in_utf16_code_units - count));
+            TRY(range->set_start(*range->start_container(), range->start_offset() + inserted_data.size() - count));
     }
 
     // 11. For each live range whose end node is node and end offset is greater than offset plus count, increase its end offset by data’s length and decrease it by count.
     for (auto& range : Range::live_ranges()) {
         if (range->end_container() == this && range->end_offset() > (offset + count)) {
             // AD-HOC: Clamp offset to the end of the data if it's too large.
-            auto new_offset = min(range->end_offset() + inserted_data_length_in_utf16_code_units - count, length_in_utf16_code_units());
+            auto new_offset = min(range->end_offset() + inserted_data.size() - count, length_in_utf16_code_units());
             TRY(range->set_end(*range->end_container(), new_offset));
         }
     }
diff --git a/Tests/LibWeb/Text/expected/wpt-import/dom/nodes/CharacterData-surrogates.txt b/Tests/LibWeb/Text/expected/wpt-import/dom/nodes/CharacterData-surrogates.txt
new file mode 100644
index 00000000000..f20bda29dcd
--- /dev/null
+++ b/Tests/LibWeb/Text/expected/wpt-import/dom/nodes/CharacterData-surrogates.txt
@@ -0,0 +1,18 @@
+Summary
+
+Harness status: OK
+
+Rerun
+
+Found 8 tests
+
+8 Pass
+Details
+Result	Test Name	MessagePass	Text.substringData() splitting surrogate pairs	
+Pass	Text.replaceData() splitting and creating surrogate pairs	
+Pass	Text.deleteData() splitting and creating surrogate pairs	
+Pass	Text.insertData() splitting and creating surrogate pairs	
+Pass	Comment.substringData() splitting surrogate pairs	
+Pass	Comment.replaceData() splitting and creating surrogate pairs	
+Pass	Comment.deleteData() splitting and creating surrogate pairs	
+Pass	Comment.insertData() splitting and creating surrogate pairs	
\ No newline at end of file
diff --git a/Tests/LibWeb/Text/input/wpt-import/dom/nodes/CharacterData-surrogates.html b/Tests/LibWeb/Text/input/wpt-import/dom/nodes/CharacterData-surrogates.html
new file mode 100644
index 00000000000..b46e7e5c4a2
--- /dev/null
+++ b/Tests/LibWeb/Text/input/wpt-import/dom/nodes/CharacterData-surrogates.html
@@ -0,0 +1,74 @@
+<!DOCTYPE html>
+<meta charset=utf-8>
+<title>Splitting and joining surrogate pairs in CharacterData methods</title>
+<link rel=help href="https://dom.spec.whatwg.org/#dom-characterdata-substringdata">
+<link rel=help href="https://dom.spec.whatwg.org/#dom-characterdata-replacedata">
+<link rel=help href="https://dom.spec.whatwg.org/#dom-characterdata-insertdata">
+<link rel=help href="https://dom.spec.whatwg.org/#dom-characterdata-deletedata">
+<link rel=help href="https://dom.spec.whatwg.org/#dom-characterdata-data">
+<script src="../../resources/testharness.js"></script>
+<script src="../../resources/testharnessreport.js"></script>
+<div id="log"></div>
+<script>
+function testNode(create, type) {
+  test(function() {
+    var node = create()
+    assert_equals(node.data, "test")
+
+    node.data = "🌠 test 🌠 TEST"
+
+    assert_equals(node.substringData(1, 8), "\uDF20 test \uD83C")
+  }, type + ".substringData() splitting surrogate pairs")
+
+  test(function() {
+    var node = create()
+    assert_equals(node.data, "test")
+
+    node.data = "🌠 test 🌠 TEST"
+
+    node.replaceData(1, 4, "--");
+    assert_equals(node.data, "\uD83C--st 🌠 TEST");
+
+    node.replaceData(1, 2, "\uDF1F ");
+    assert_equals(node.data, "🌟 st 🌠 TEST");
+
+    node.replaceData(5, 2, "---");
+    assert_equals(node.data, "🌟 st---\uDF20 TEST");
+
+    node.replaceData(6, 2, " \uD83D");
+    assert_equals(node.data, "🌟 st- 🜠 TEST");
+  }, type + ".replaceData() splitting and creating surrogate pairs")
+
+  test(function() {
+    var node = create()
+    assert_equals(node.data, "test")
+
+    node.data = "🌠 test 🌠 TEST"
+
+    node.deleteData(1, 4);
+    assert_equals(node.data, "\uD83Cst 🌠 TEST");
+
+    node.deleteData(1, 4);
+    assert_equals(node.data, "🌠 TEST");
+  }, type + ".deleteData() splitting and creating surrogate pairs")
+
+  test(function() {
+    var node = create()
+    assert_equals(node.data, "test")
+
+    node.data = "🌠 test 🌠 TEST"
+
+    node.insertData(1, "--");
+    assert_equals(node.data, "\uD83C--\uDF20 test 🌠 TEST");
+
+    node.insertData(1, "\uDF1F ");
+    assert_equals(node.data, "🌟 --\uDF20 test 🌠 TEST");
+
+    node.insertData(5, " \uD83D");
+    assert_equals(node.data, "🌟 -- 🜠 test 🌠 TEST");
+  }, type + ".insertData() splitting and creating surrogate pairs")
+}
+
+testNode(function() { return document.createTextNode("test") }, "Text")
+testNode(function() { return document.createComment("test") }, "Comment")
+</script>