LibWeb: Use a Unicode text segmenter to select words on double-click

We currently use naive word segmentation, looking for ASCII spaces to mark a word boundary. Use LibUnicode's complete implementation instead.
Author: https://github.com/trflynn89 Commit: 430c9d3e3f Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/1205
2025-09-05 09:06:08 +00:00 · 2024-08-27 08:19:03 -04:00 · 2024-08-27 08:19:03 -04:00 · 430c9d3e3f · 2024-08-31 13:52:32 +00:00
commit 430c9d3e3f
parent 11e7d72686
2 changed files with 19 additions and 20 deletions
--- a/Userland/Libraries/LibWeb/Page/EventHandler.cpp
+++ b/Userland/Libraries/LibWeb/Page/EventHandler.cpp
@@ -5,6 +5,7 @@
 * SPDX-License-Identifier: BSD-2-Clause
 */

+#include <LibUnicode/Segmenter.h>
 #include <LibWeb/DOM/Range.h>
 #include <LibWeb/DOM/Text.h>
 #include <LibWeb/HTML/BrowsingContext.h>
@@ -685,30 +686,16 @@ bool EventHandler::handle_doubleclick(CSSPixelPoint viewport_position, CSSPixelP
            auto& hit_dom_node = const_cast<DOM::Text&>(verify_cast<DOM::Text>(*hit_paintable.dom_node()));
            auto const& text_for_rendering = hit_paintable.text_for_rendering();

-            int first_word_break_before = [&] {
-                // Start from one before the index position to prevent selecting only spaces between words, caused by the addition below.
-                // This also helps us dealing with cases where index is equal to the string length.
-                for (int i = result->index_in_node - 1; i >= 0; --i) {
-                    if (is_ascii_space(text_for_rendering.bytes_as_string_view()[i])) {
-                        // Don't include the space in the selection
-                        return i + 1;
-                    }
-                }
-                return 0;
-            }();
+            auto& segmenter = word_segmenter();
+            segmenter.set_segmented_text(text_for_rendering);

-            int first_word_break_after = [&] {
-                for (size_t i = result->index_in_node; i < text_for_rendering.bytes().size(); ++i) {
-                    if (is_ascii_space(text_for_rendering.bytes_as_string_view()[i]))
-                        return i;
-                }
-                return text_for_rendering.bytes().size();
-            }();
+            auto previous_boundary = segmenter.previous_boundary(result->index_in_node, Unicode::Segmenter::Inclusive::Yes).value_or(0);
+            auto next_boundary = segmenter.next_boundary(result->index_in_node).value_or(text_for_rendering.byte_count());

            auto& realm = node->document().realm();
-            document.set_cursor_position(DOM::Position::create(realm, hit_dom_node, first_word_break_after));
+            document.set_cursor_position(DOM::Position::create(realm, hit_dom_node, next_boundary));
            if (auto selection = node->document().get_selection()) {
-                (void)selection->set_base_and_extent(hit_dom_node, first_word_break_before, hit_dom_node, first_word_break_after);
+                (void)selection->set_base_and_extent(hit_dom_node, previous_boundary, hit_dom_node, next_boundary);
            }
            update_selection_range_for_input_or_textarea();
        }
@@ -1169,4 +1156,11 @@ void EventHandler::update_selection_range_for_input_or_textarea()
        target.value().set_the_selection_range(selection_start, selection_end, direction);
 }

+Unicode::Segmenter& EventHandler::word_segmenter() // Returns the word-granularity segmenter, creating it on first use.
+{
+    if (!m_word_segmenter) // Not created yet: build it once and keep it in the member for later calls.
+        m_word_segmenter = Unicode::Segmenter::create(Unicode::SegmenterGranularity::Word);
+    return *m_word_segmenter;
+}
+
 }
--- a/Userland/Libraries/LibWeb/Page/EventHandler.h
+++ b/Userland/Libraries/LibWeb/Page/EventHandler.h
@@ -12,6 +12,7 @@
 #include <LibGfx/Forward.h>
 #include <LibJS/Heap/Cell.h>
 #include <LibJS/Heap/GCPtr.h>
+#include <LibUnicode/Forward.h>
 #include <LibWeb/Forward.h>
 #include <LibWeb/Page/InputEvent.h>
 #include <LibWeb/PixelUnits.h>
@@ -41,6 +42,8 @@ public:

    void visit_edges(JS::Cell::Visitor& visitor) const;

+    Unicode::Segmenter& word_segmenter();
+
 private:
    bool focus_next_element();
    bool focus_previous_element();
@@ -74,6 +77,8 @@ private:
    WeakPtr<DOM::EventTarget> m_mousedown_target;

    Optional<CSSPixelPoint> m_mousemove_previous_screen_position;
+
+    OwnPtr<Unicode::Segmenter> m_word_segmenter;
 };

 }