LibWeb: Iterate over text chunks using a grapheme-aware segmenter

Our current text iterator is not aware of multi-code-point graphemes. Instead of simply incrementing an iterator one code point at a time, use our Unicode grapheme segmenter to break text into fragments.
Author: https://github.com/trflynn89 Commit: f0105b473b Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/1465
2025-08-28 13:18:19 +00:00 · 2024-09-20 17:14:14 -04:00 · 2024-09-20 17:14:14 -04:00 · f0105b473b · 2024-09-21 06:58:43 +00:00
commit f0105b473b
parent aef85a83bd
5 changed files with 84 additions and 32 deletions
--- a/Userland/Libraries/LibWeb/Layout/TextNode.cpp
+++ b/Userland/Libraries/LibWeb/Layout/TextNode.cpp
@@ -391,13 +391,14 @@ void TextNode::compute_text_for_rendering()
    m_text_for_rendering = MUST(builder.to_string());
 }

-TextNode::ChunkIterator::ChunkIterator(StringView text, bool wrap_lines, bool respect_linebreaks, Gfx::FontCascadeList const& font_cascade_list)
+TextNode::ChunkIterator::ChunkIterator(String const& text, bool wrap_lines, bool respect_linebreaks, Gfx::FontCascadeList const& font_cascade_list)
    : m_wrap_lines(wrap_lines)
    , m_respect_linebreaks(respect_linebreaks)
    , m_utf8_view(text)
-    , m_iterator(m_utf8_view.begin())
    , m_font_cascade_list(font_cascade_list)
+    , m_segmenter(Unicode::Segmenter::create(Unicode::SegmenterGranularity::Grapheme))
 {
+    m_segmenter->set_segmented_text(text);
 }

 static Gfx::GlyphRun::TextType text_type_for_code_point(u32 code_point)
@@ -462,75 +463,85 @@ Optional<TextNode::Chunk> TextNode::ChunkIterator::peek(size_t count)

 Optional<TextNode::Chunk> TextNode::ChunkIterator::next_without_peek()
 {
-    if (m_iterator == m_utf8_view.end())
+    if (m_current_index >= m_utf8_view.byte_length())
        return {};

-    auto start_of_chunk = m_iterator;
+    auto current_code_point = [this]() {
+        return *m_utf8_view.iterator_at_byte_offset_without_validation(m_current_index);
+    };
+    auto next_grapheme_boundary = [this]() {
+        return m_segmenter->next_boundary(m_current_index).value_or(m_utf8_view.byte_length());
+    };

-    Gfx::Font const& font = m_font_cascade_list.font_for_code_point(*m_iterator);
-    auto text_type = text_type_for_code_point(*m_iterator);
-    while (m_iterator != m_utf8_view.end()) {
-        if (&font != &m_font_cascade_list.font_for_code_point(*m_iterator)) {
-            if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
+    auto code_point = current_code_point();
+    auto start_of_chunk = m_current_index;
+
+    Gfx::Font const& font = m_font_cascade_list.font_for_code_point(code_point);
+    auto text_type = text_type_for_code_point(code_point);
+
+    while (m_current_index < m_utf8_view.byte_length()) {
+        code_point = current_code_point();
+
+        if (&font != &m_font_cascade_list.font_for_code_point(code_point)) {
+            if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, font, text_type); result.has_value())
                return result.release_value();
        }

-        if (m_respect_linebreaks && *m_iterator == '\n') {
+        if (m_respect_linebreaks && code_point == '\n') {
            // Newline encountered, and we're supposed to preserve them.
            // If we have accumulated some code points in the current chunk, commit them now and continue with the newline next time.
-            if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
+            if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, font, text_type); result.has_value())
                return result.release_value();

            // Otherwise, commit the newline!
-            ++m_iterator;
-            auto result = try_commit_chunk(start_of_chunk, m_iterator, true, font, text_type);
+            m_current_index = next_grapheme_boundary();
+            auto result = try_commit_chunk(start_of_chunk, m_current_index, true, font, text_type);
            VERIFY(result.has_value());
            return result.release_value();
        }

        if (m_wrap_lines) {
-            if (text_type != text_type_for_code_point(*m_iterator)) {
-                if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
+            if (text_type != text_type_for_code_point(code_point)) {
+                if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, font, text_type); result.has_value()) {
                    return result.release_value();
+                }
            }

-            if (is_ascii_space(*m_iterator)) {
+            if (is_ascii_space(code_point)) {
                // Whitespace encountered, and we're allowed to break on whitespace.
                // If we have accumulated some code points in the current chunk, commit them now and continue with the whitespace next time.
-                if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
+                if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, font, text_type); result.has_value()) {
                    return result.release_value();
+                }

                // Otherwise, commit the whitespace!
-                ++m_iterator;
-                if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
+                m_current_index = next_grapheme_boundary();
+                if (auto result = try_commit_chunk(start_of_chunk, m_current_index, false, font, text_type); result.has_value())
                    return result.release_value();
                continue;
            }
        }

-        ++m_iterator;
+        m_current_index = next_grapheme_boundary();
    }

-    if (start_of_chunk != m_utf8_view.end()) {
+    if (start_of_chunk != m_utf8_view.byte_length()) {
        // Try to output whatever's left at the end of the text node.
-        if (auto result = try_commit_chunk(start_of_chunk, m_utf8_view.end(), false, font, text_type); result.has_value())
+        if (auto result = try_commit_chunk(start_of_chunk, m_utf8_view.byte_length(), false, font, text_type); result.has_value())
            return result.release_value();
    }

    return {};
 }

-Optional<TextNode::Chunk> TextNode::ChunkIterator::try_commit_chunk(Utf8View::Iterator const& start, Utf8View::Iterator const& end, bool has_breaking_newline, Gfx::Font const& font, Gfx::GlyphRun::TextType text_type) const
+Optional<TextNode::Chunk> TextNode::ChunkIterator::try_commit_chunk(size_t start, size_t end, bool has_breaking_newline, Gfx::Font const& font, Gfx::GlyphRun::TextType text_type) const
 {
-    auto byte_offset = m_utf8_view.byte_offset_of(start);
-    auto byte_length = m_utf8_view.byte_offset_of(end) - byte_offset;
-
-    if (byte_length > 0) {
-        auto chunk_view = m_utf8_view.substring_view(byte_offset, byte_length);
+    if (auto byte_length = end - start; byte_length > 0) {
+        auto chunk_view = m_utf8_view.substring_view(start, byte_length);
        return Chunk {
            .view = chunk_view,
            .font = font,
-            .start = byte_offset,
+            .start = start,
            .length = byte_length,
            .has_breaking_newline = has_breaking_newline,
            .is_all_whitespace = is_all_whitespace(chunk_view.as_string()),