LibWeb: Layout text chunks based on their Unicode direction

Append text chunks to either the start or end of the text fragment, depending on the text direction. The direction is determined by what script its code points are from.
Author: https://github.com/BenJilks Commit: 11e7d72686 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/1133 Reviewed-by: https://github.com/diegoiast
2025-08-07 08:39:22 +00:00 · 2024-08-18 17:58:05 +01:00 · 2024-08-18 17:58:05 +01:00 · 11e7d72686 · 2024-08-31 09:50:41 +00:00
commit 11e7d72686
parent 2f5b070716
18 changed files with 460 additions and 150 deletions
--- a/Userland/Libraries/LibWeb/Layout/TextNode.cpp
+++ b/Userland/Libraries/LibWeb/Layout/TextNode.cpp
@ -400,7 +400,67 @@ TextNode::ChunkIterator::ChunkIterator(StringView text, bool wrap_lines, bool re
 {
 }

+static Gfx::GlyphRun::TextType text_type_for_code_point(u32 code_point)
+{
+    switch (Unicode::bidirectional_class(code_point)) {
+    case Unicode::BidiClass::WhiteSpaceNeutral:
+
+    case Unicode::BidiClass::BlockSeparator:
+    case Unicode::BidiClass::SegmentSeparator:
+    case Unicode::BidiClass::CommonNumberSeparator:
+    case Unicode::BidiClass::DirNonSpacingMark:
+
+    case Unicode::BidiClass::ArabicNumber:
+    case Unicode::BidiClass::EuropeanNumber:
+    case Unicode::BidiClass::EuropeanNumberSeparator:
+    case Unicode::BidiClass::EuropeanNumberTerminator:
+        return Gfx::GlyphRun::TextType::ContextDependent;
+
+    case Unicode::BidiClass::BoundaryNeutral:
+    case Unicode::BidiClass::OtherNeutral:
+    case Unicode::BidiClass::FirstStrongIsolate:
+    case Unicode::BidiClass::PopDirectionalFormat:
+    case Unicode::BidiClass::PopDirectionalIsolate:
+        return Gfx::GlyphRun::TextType::Common;
+
+    case Unicode::BidiClass::LeftToRight:
+    case Unicode::BidiClass::LeftToRightEmbedding:
+    case Unicode::BidiClass::LeftToRightIsolate:
+    case Unicode::BidiClass::LeftToRightOverride:
+        return Gfx::GlyphRun::TextType::Ltr;
+
+    case Unicode::BidiClass::RightToLeft:
+    case Unicode::BidiClass::RightToLeftArabic:
+    case Unicode::BidiClass::RightToLeftEmbedding:
+    case Unicode::BidiClass::RightToLeftIsolate:
+    case Unicode::BidiClass::RightToLeftOverride:
+        return Gfx::GlyphRun::TextType::Rtl;
+
+    default:
+        VERIFY_NOT_REACHED();
+    }
+}
+
 Optional<TextNode::Chunk> TextNode::ChunkIterator::next()
+{
+    if (!m_peek_queue.is_empty())
+        return m_peek_queue.take_first();
+    return next_without_peek();
+}
+
+Optional<TextNode::Chunk> TextNode::ChunkIterator::peek(size_t count)
+{
+    while (m_peek_queue.size() <= count) {
+        auto next = next_without_peek();
+        if (!next.has_value())
+            return {};
+        m_peek_queue.append(*next);
+    }
+
+    return m_peek_queue[count];
+}
+
+Optional<TextNode::Chunk> TextNode::ChunkIterator::next_without_peek()
 {
    if (m_iterator == m_utf8_view.end())
        return {};
@ -408,35 +468,41 @@ Optional<TextNode::Chunk> TextNode::ChunkIterator::next()
    auto start_of_chunk = m_iterator;

    Gfx::Font const& font = m_font_cascade_list.font_for_code_point(*m_iterator);
+    auto text_type = text_type_for_code_point(*m_iterator);
    while (m_iterator != m_utf8_view.end()) {
        if (&font != &m_font_cascade_list.font_for_code_point(*m_iterator)) {
-            if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font); result.has_value())
+            if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
                return result.release_value();
        }

        if (m_respect_linebreaks && *m_iterator == '\n') {
            // Newline encountered, and we're supposed to preserve them.
            // If we have accumulated some code points in the current chunk, commit them now and continue with the newline next time.
-            if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font); result.has_value())
+            if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
                return result.release_value();

            // Otherwise, commit the newline!
            ++m_iterator;
-            auto result = try_commit_chunk(start_of_chunk, m_iterator, true, font);
+            auto result = try_commit_chunk(start_of_chunk, m_iterator, true, font, text_type);
            VERIFY(result.has_value());
            return result.release_value();
        }

        if (m_wrap_lines) {
+            if (text_type != text_type_for_code_point(*m_iterator)) {
+                if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
+                    return result.release_value();
+            }
+
            if (is_ascii_space(*m_iterator)) {
                // Whitespace encountered, and we're allowed to break on whitespace.
                // If we have accumulated some code points in the current chunk, commit them now and continue with the whitespace next time.
-                if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font); result.has_value())
+                if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
                    return result.release_value();

                // Otherwise, commit the whitespace!
                ++m_iterator;
-                if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font); result.has_value())
+                if (auto result = try_commit_chunk(start_of_chunk, m_iterator, false, font, text_type); result.has_value())
                    return result.release_value();
                continue;
            }
@ -447,14 +513,14 @@ Optional<TextNode::Chunk> TextNode::ChunkIterator::next()

    if (start_of_chunk != m_utf8_view.end()) {
        // Try to output whatever's left at the end of the text node.
-        if (auto result = try_commit_chunk(start_of_chunk, m_utf8_view.end(), false, font); result.has_value())
+        if (auto result = try_commit_chunk(start_of_chunk, m_utf8_view.end(), false, font, text_type); result.has_value())
            return result.release_value();
    }

    return {};
 }

-Optional<TextNode::Chunk> TextNode::ChunkIterator::try_commit_chunk(Utf8View::Iterator const& start, Utf8View::Iterator const& end, bool has_breaking_newline, Gfx::Font const& font) const
+Optional<TextNode::Chunk> TextNode::ChunkIterator::try_commit_chunk(Utf8View::Iterator const& start, Utf8View::Iterator const& end, bool has_breaking_newline, Gfx::Font const& font, Gfx::GlyphRun::TextType text_type) const
 {
    auto byte_offset = m_utf8_view.byte_offset_of(start);
    auto byte_length = m_utf8_view.byte_offset_of(end) - byte_offset;
@ -468,6 +534,7 @@ Optional<TextNode::Chunk> TextNode::ChunkIterator::try_commit_chunk(Utf8View::It
            .length = byte_length,
            .has_breaking_newline = has_breaking_newline,
            .is_all_whitespace = is_all_whitespace(chunk_view.as_string()),
+            .text_type = text_type,
        };
    }