mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-09-05 09:06:08 +00:00
AK: Allow indexing at length in Utf8View::byte_offset_of()
Do the same for Utf8View::code_point_offset_of().

Some of the `VERIFY`s on the view's length were introduced recently, but they caused the parsing of named capture groups in RegexParser to crash in some situations.

Instead, allow indexing at the view's length: the byte offset of code point `length()` is known, even though that code point does not exist in the view. Similarly, the code point offset at byte offset `byte_length()` is known. Beyond those offsets, the `VERIFY` still fails and we crash.

Fixes 13 failures in test262's `language/literals/regexp/named-groups`.
This commit is contained in:
parent
3db7d802db
commit
265e278275
Notes:
github-actions[bot]
2025-07-22 13:11:42 +00:00
Author: https://github.com/gmta
Commit: 265e278275
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5558
Reviewed-by: https://github.com/trflynn89 ✅
2 changed files with 18 additions and 5 deletions
|
@ -32,24 +32,25 @@ Utf8CodePointIterator Utf8View::iterator_at_byte_offset_without_validation(size_
|
||||||
|
|
||||||
size_t Utf8View::code_point_offset_of(size_t byte_offset) const
|
size_t Utf8View::code_point_offset_of(size_t byte_offset) const
|
||||||
{
|
{
|
||||||
VERIFY(byte_offset < byte_length());
|
VERIFY(byte_offset <= byte_length());
|
||||||
|
|
||||||
// Fast path: each code point is represented by a single byte.
|
// Fast path: each code point is represented by a single byte.
|
||||||
if (length() == byte_length())
|
if (length() == byte_length())
|
||||||
return byte_offset;
|
return byte_offset;
|
||||||
|
|
||||||
size_t code_point_offset = 0;
|
size_t code_point_offset = 0;
|
||||||
for (auto it = begin(); !it.done(); ++it) {
|
auto it = begin();
|
||||||
if (it.m_ptr > begin_ptr() + byte_offset)
|
while (!it.done()) {
|
||||||
|
if ((++it).m_ptr > begin_ptr() + byte_offset)
|
||||||
break;
|
break;
|
||||||
++code_point_offset;
|
++code_point_offset;
|
||||||
}
|
}
|
||||||
return code_point_offset - 1;
|
return code_point_offset;
|
||||||
}
|
}
|
||||||
|
|
||||||
size_t Utf8View::byte_offset_of(size_t code_point_offset) const
|
size_t Utf8View::byte_offset_of(size_t code_point_offset) const
|
||||||
{
|
{
|
||||||
VERIFY(code_point_offset < length());
|
VERIFY(code_point_offset <= length());
|
||||||
|
|
||||||
// Fast path: each code point is represented by a single byte.
|
// Fast path: each code point is represented by a single byte.
|
||||||
if (length() == byte_length())
|
if (length() == byte_length())
|
||||||
|
|
|
@ -345,4 +345,16 @@ TEST_CASE(code_point_offset_of)
|
||||||
EXPECT_EQ(1u, view.code_point_offset_of(4));
|
EXPECT_EQ(1u, view.code_point_offset_of(4));
|
||||||
EXPECT_EQ(2u, view.code_point_offset_of(5));
|
EXPECT_EQ(2u, view.code_point_offset_of(5));
|
||||||
EXPECT_EQ(3u, view.code_point_offset_of(6));
|
EXPECT_EQ(3u, view.code_point_offset_of(6));
|
||||||
|
EXPECT_EQ(4u, view.code_point_offset_of(7));
|
||||||
|
}
|
||||||
|
|
||||||
|
TEST_CASE(byte_offset_of)
|
||||||
|
{
|
||||||
|
Utf8View view { "😂foo"sv };
|
||||||
|
|
||||||
|
EXPECT_EQ(0u, view.byte_offset_of(0));
|
||||||
|
EXPECT_EQ(4u, view.byte_offset_of(1));
|
||||||
|
EXPECT_EQ(5u, view.byte_offset_of(2));
|
||||||
|
EXPECT_EQ(6u, view.byte_offset_of(3));
|
||||||
|
EXPECT_EQ(7u, view.byte_offset_of(4));
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue