AK: Allow indexing at length in Utf8View::byte_offset_of()

And do the same for Utf8View::code_point_offset_of(). Some of these `VERIFY`s of the view's length were introduced recently, but they caused the parsing of named capture groups in RegexParser to crash in some situations. Instead, allow indexing at the view's length: the byte offset of code point `length()` is known, even though that code point does not exist in the view. Similarly, we know the code point offset at byte offset `byte_length()`. Beyond those offsets, we still crash. Fixes 13 failures in test262's `language/literals/regexp/named-groups`.
Author: https://github.com/gmta Commit: 265e278275 Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5558 Reviewed-by: https://github.com/trflynn89 ✅
2025-09-04 08:36:12 +00:00 · 2025-07-22 11:04:19 +02:00 · 2025-07-22 11:04:19 +02:00 · 265e278275 · 2025-07-22 13:11:42 +00:00
commit 265e278275
parent 3db7d802db
2 changed files with 18 additions and 5 deletions
--- a/AK/Utf8View.cpp
+++ b/AK/Utf8View.cpp
@@ -32,24 +32,25 @@ Utf8CodePointIterator Utf8View::iterator_at_byte_offset_without_validation(size_

 size_t Utf8View::code_point_offset_of(size_t byte_offset) const
 {
-    VERIFY(byte_offset < byte_length());
+    VERIFY(byte_offset <= byte_length());

    // Fast path: each code point is represented by a single byte.
    if (length() == byte_length())
        return byte_offset;

    size_t code_point_offset = 0;
-    for (auto it = begin(); !it.done(); ++it) {
-        if (it.m_ptr > begin_ptr() + byte_offset)
+    auto it = begin();
+    while (!it.done()) {
+        if ((++it).m_ptr > begin_ptr() + byte_offset)
            break;
        ++code_point_offset;
    }
-    return code_point_offset - 1;
+    return code_point_offset;
 }

 size_t Utf8View::byte_offset_of(size_t code_point_offset) const
 {
-    VERIFY(code_point_offset < length());
+    VERIFY(code_point_offset <= length());

    // Fast path: each code point is represented by a single byte.
    if (length() == byte_length())
--- a/Tests/AK/TestUtf8View.cpp
+++ b/Tests/AK/TestUtf8View.cpp
@@ -345,4 +345,16 @@ TEST_CASE(code_point_offset_of)
    EXPECT_EQ(1u, view.code_point_offset_of(4));
    EXPECT_EQ(2u, view.code_point_offset_of(5));
    EXPECT_EQ(3u, view.code_point_offset_of(6));
+    EXPECT_EQ(4u, view.code_point_offset_of(7));
+}
+
+TEST_CASE(byte_offset_of)
+{
+    Utf8View view { "😂foo"sv };
+
+    EXPECT_EQ(0u, view.byte_offset_of(0));
+    EXPECT_EQ(4u, view.byte_offset_of(1));
+    EXPECT_EQ(5u, view.byte_offset_of(2));
+    EXPECT_EQ(6u, view.byte_offset_of(3));
+    EXPECT_EQ(7u, view.byte_offset_of(4));
 }