mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-09-04 08:36:12 +00:00
AK: Allow indexing at length in Utf8View::byte_offset_of()
And do the same for Utf8View::code_point_offset_of(). Some of these `VERIFY`s of the view's length were introduced recently, but they caused the parsing of named capture groups in RegexParser to crash in some situations. Instead, allow indexing at the view's length: the byte offset of code point `length()` is known, even though that code point does not exist in the view. Similarly, we know the code point offset at byte offset `byte_length()`. Beyond those offsets, we still crash. Fixes 13 failures in test262's `language/literals/regexp/named-groups`.
This commit is contained in:
parent
3db7d802db
commit
265e278275
Notes:
github-actions[bot]
2025-07-22 13:11:42 +00:00
Author: https://github.com/gmta
Commit: 265e278275
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5558
Reviewed-by: https://github.com/trflynn89 ✅
2 changed files with 18 additions and 5 deletions
|
@@ -32,24 +32,25 @@ Utf8CodePointIterator Utf8View::iterator_at_byte_offset_without_validation(size_
|
|||
|
||||
// Translates a byte offset into this view to the index of the code point found at that position.
// NOTE(review): this is a rendered diff hunk whose +/- markers were stripped, so the lines the
// commit removed and the lines that replace them both appear below; they are labelled old/new.
size_t Utf8View::code_point_offset_of(size_t byte_offset) const
|
||||
{
|
||||
// Old precondition: the offset had to lie strictly inside the view.
VERIFY(byte_offset < byte_length());
|
||||
// New precondition: byte_offset == byte_length() is accepted too; anything beyond still fails.
VERIFY(byte_offset <= byte_length());
|
||||
|
||||
// Fast path: each code point is represented by a single byte.
|
||||
if (length() == byte_length())
|
||||
return byte_offset;
|
||||
|
||||
size_t code_point_offset = 0;
|
||||
// Old loop: compares the iterator's current position with the target, counts, then advances.
for (auto it = begin(); !it.done(); ++it) {
|
||||
if (it.m_ptr > begin_ptr() + byte_offset)
|
||||
// New loop: advances first, and counts a code point only when the advanced position has not
// moved past the target byte.
auto it = begin();
|
||||
while (!it.done()) {
|
||||
if ((++it).m_ptr > begin_ptr() + byte_offset)
|
||||
break;
|
||||
++code_point_offset;
|
||||
}
|
||||
// Old result, paired with the old loop above.
return code_point_offset - 1;
|
||||
// New result, paired with the new loop above.
return code_point_offset;
|
||||
}
|
||||
|
||||
size_t Utf8View::byte_offset_of(size_t code_point_offset) const
|
||||
{
|
||||
VERIFY(code_point_offset < length());
|
||||
VERIFY(code_point_offset <= length());
|
||||
|
||||
// Fast path: each code point is represented by a single byte.
|
||||
if (length() == byte_length())
|
||||
|
|
|
@@ -345,4 +345,16 @@ TEST_CASE(code_point_offset_of)
|
|||
EXPECT_EQ(1u, view.code_point_offset_of(4));
|
||||
EXPECT_EQ(2u, view.code_point_offset_of(5));
|
||||
EXPECT_EQ(3u, view.code_point_offset_of(6));
|
||||
EXPECT_EQ(4u, view.code_point_offset_of(7));
|
||||
}
|
||||
|
||||
// Checks Utf8View::byte_offset_of() on a view that starts with a multi-byte code point,
// including the index equal to the view's length.
TEST_CASE(byte_offset_of)
|
||||
{
|
||||
// The emoji is encoded as 4 bytes in UTF-8; 'f', 'o' and 'o' take 1 byte each (7 bytes total).
Utf8View view { "😂foo"sv };
|
||||
|
||||
EXPECT_EQ(0u, view.byte_offset_of(0));
|
||||
EXPECT_EQ(4u, view.byte_offset_of(1));
|
||||
EXPECT_EQ(5u, view.byte_offset_of(2));
|
||||
EXPECT_EQ(6u, view.byte_offset_of(3));
|
||||
// Index 4 equals the number of code points: no code point exists there, but its byte offset
// is still defined and expected to be 7.
EXPECT_EQ(7u, view.byte_offset_of(4));
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue