From 265e278275f699859092bcb2743c083591678ff2 Mon Sep 17 00:00:00 2001
From: Jelle Raaijmakers
Date: Tue, 22 Jul 2025 11:04:19 +0200
Subject: [PATCH] AK: Allow indexing at length in Utf8View::byte_offset_of()

And do the same for Utf8View::code_point_offset_of(). Some of these
`VERIFY`s of the view's length were introduced recently, but they caused
the parsing of named capture groups in RegexParser to crash in some
situations.

Instead, allow indexing at the view's length: the byte offset of code
point `length()` is known, even though that code point does not exist in
the view. Similarly, we know the code point offset at byte offset
`byte_length()`. Beyond those offsets, we still crash.

Fixes 13 failures in test262's `language/literals/regexp/named-groups`.
---
 AK/Utf8View.cpp           | 11 ++++++-----
 Tests/AK/TestUtf8View.cpp | 12 ++++++++++++
 2 files changed, 18 insertions(+), 5 deletions(-)

diff --git a/AK/Utf8View.cpp b/AK/Utf8View.cpp
index 18535cefd34..2caa17909bf 100644
--- a/AK/Utf8View.cpp
+++ b/AK/Utf8View.cpp
@@ -32,24 +32,25 @@ Utf8CodePointIterator Utf8View::iterator_at_byte_offset_without_validation(size_
 
 size_t Utf8View::code_point_offset_of(size_t byte_offset) const
 {
-    VERIFY(byte_offset < byte_length());
+    VERIFY(byte_offset <= byte_length());
 
     // Fast path: each code point is represented by a single byte.
     if (length() == byte_length())
         return byte_offset;
 
     size_t code_point_offset = 0;
-    for (auto it = begin(); !it.done(); ++it) {
-        if (it.m_ptr > begin_ptr() + byte_offset)
+    auto it = begin();
+    while (!it.done()) {
+        if ((++it).m_ptr > begin_ptr() + byte_offset)
             break;
         ++code_point_offset;
     }
-    return code_point_offset - 1;
+    return code_point_offset;
 }
 
 size_t Utf8View::byte_offset_of(size_t code_point_offset) const
 {
-    VERIFY(code_point_offset < length());
+    VERIFY(code_point_offset <= length());
 
     // Fast path: each code point is represented by a single byte.
     if (length() == byte_length())
diff --git a/Tests/AK/TestUtf8View.cpp b/Tests/AK/TestUtf8View.cpp
index 10775410611..839f50f271d 100644
--- a/Tests/AK/TestUtf8View.cpp
+++ b/Tests/AK/TestUtf8View.cpp
@@ -345,4 +345,16 @@ TEST_CASE(code_point_offset_of)
     EXPECT_EQ(1u, view.code_point_offset_of(4));
     EXPECT_EQ(2u, view.code_point_offset_of(5));
     EXPECT_EQ(3u, view.code_point_offset_of(6));
+    EXPECT_EQ(4u, view.code_point_offset_of(7));
+}
+
+TEST_CASE(byte_offset_of)
+{
+    Utf8View view { "😂foo"sv };
+
+    EXPECT_EQ(0u, view.byte_offset_of(0));
+    EXPECT_EQ(4u, view.byte_offset_of(1));
+    EXPECT_EQ(5u, view.byte_offset_of(2));
+    EXPECT_EQ(6u, view.byte_offset_of(3));
+    EXPECT_EQ(7u, view.byte_offset_of(4));
 }