AK: Allow indexing at length in Utf8View::byte_offset_of()

And do the same for Utf8View::code_point_offset_of(). Some of these
`VERIFY`s of the view's length were introduced recently, but they caused
the parsing of named capture groups in RegexParser to crash in some
situations.

Instead, allow indexing at the view's length: the byte offset of code
point `length()` is known, even though that code point does not exist in
the view. Similarly, we know the code point offset at byte offset
`byte_length()`. Beyond those offsets, we still crash.

Fixes 13 failures in test262's `language/literals/regexp/named-groups`.
This commit is contained in:
Jelle Raaijmakers 2025-07-22 11:04:19 +02:00 committed by Tim Flynn
commit 265e278275
Notes: github-actions[bot] 2025-07-22 13:11:42 +00:00
2 changed files with 18 additions and 5 deletions

View file

@ -32,24 +32,25 @@ Utf8CodePointIterator Utf8View::iterator_at_byte_offset_without_validation(size_
// Converts a byte offset into this view to a code point offset.
//
// Returns the number of code points that end at or before `byte_offset`, which
// is the index of the code point containing that byte. A byte offset pointing
// into the middle of a multi-byte code point therefore maps to that code
// point's own index.
//
// `byte_offset` may be equal to byte_length(): no code point starts there, but
// its code point offset is still known (it is length()). Any offset beyond
// byte_length() fails the VERIFY below.
size_t Utf8View::code_point_offset_of(size_t byte_offset) const
{
    VERIFY(byte_offset <= byte_length());

    // Fast path: each code point is represented by a single byte.
    if (length() == byte_length())
        return byte_offset;

    size_t code_point_offset = 0;

    auto it = begin();
    while (!it.done()) {
        // Advance first, then compare: after the increment, `it` points at the
        // start of the next code point. If that start lies past the requested
        // byte, the code point just stepped over is the one containing
        // `byte_offset`, so it must not be counted.
        if ((++it).m_ptr > begin_ptr() + byte_offset)
            break;
        ++code_point_offset;
    }

    return code_point_offset;
}
size_t Utf8View::byte_offset_of(size_t code_point_offset) const
{
VERIFY(code_point_offset < length());
VERIFY(code_point_offset <= length());
// Fast path: each code point is represented by a single byte.
if (length() == byte_length())

View file

@ -345,4 +345,16 @@ TEST_CASE(code_point_offset_of)
EXPECT_EQ(1u, view.code_point_offset_of(4));
EXPECT_EQ(2u, view.code_point_offset_of(5));
EXPECT_EQ(3u, view.code_point_offset_of(6));
EXPECT_EQ(4u, view.code_point_offset_of(7));
}
// Exercises Utf8View::byte_offset_of() on a view that mixes one multi-byte
// code point with single-byte ones, including the offset one past the last
// code point.
TEST_CASE(byte_offset_of)
{
// "😂" (U+1F602) takes 4 bytes in UTF-8; 'f', 'o', 'o' take 1 byte each.
// The view therefore holds 4 code points in 7 bytes.
Utf8View view { "😂foo"sv };
// Code point 0 is the emoji, starting at byte 0.
EXPECT_EQ(0u, view.byte_offset_of(0));
// Code points 1..3 are the ASCII letters, which follow the 4-byte emoji.
EXPECT_EQ(4u, view.byte_offset_of(1));
EXPECT_EQ(5u, view.byte_offset_of(2));
EXPECT_EQ(6u, view.byte_offset_of(3));
// Code point offset 4 is one past the last code point; the expected result
// is 7, the total number of bytes in the view.
EXPECT_EQ(7u, view.byte_offset_of(4));
}