mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-04-21 20:15:17 +00:00
AK+LibRegex: Add Utf16View::code_point_at and use it in RegexStringView
The current method of iterating through the string to access a code point hurts performance quite badly for very large strings. The test262 test "RegExp/property-escapes/generated/Any.js" previously took 3 hours to complete; this one change brings it down to under 10 seconds.
This commit is contained in:
parent
bed51d856a
commit
510bbcd8e0
Notes:
sideshowbarker
2024-07-18 07:30:48 +09:00
Author: https://github.com/trflynn89 Commit: https://github.com/SerenityOS/serenity/commit/510bbcd8e0b Pull-request: https://github.com/SerenityOS/serenity/pull/9177
3 changed files with 21 additions and 0 deletions
|
@ -111,6 +111,23 @@ u16 Utf16View::code_unit_at(size_t index) const
|
|||
return m_code_units[index];
|
||||
}
|
||||
|
||||
// Returns the code point that starts at the code unit position `index`.
//
// `index` is a code unit offset (not a code point offset) and must be less
// than length_in_code_units(); this is enforced with VERIFY.
//
// If the code unit at `index` is a high surrogate that is immediately followed
// by a low surrogate, the two are combined via decode_surrogate_pair().
// In every other case (a non-surrogate unit, a low surrogate, or a high
// surrogate without a following low surrogate) the code unit's own value is
// returned unchanged.
u32 Utf16View::code_point_at(size_t index) const
{
    auto const total_code_units = length_in_code_units();
    VERIFY(index < total_code_units);

    u32 const lead = code_unit_at(index);

    // Anything that is not a high surrogate, and a high surrogate sitting in the
    // very last code unit, cannot begin a pair: hand the unit back as-is.
    if (!is_high_surrogate(lead) || is_low_surrogate(lead) || (index + 1 == total_code_units))
        return lead;

    auto const trail = code_unit_at(index + 1);

    // Only a high surrogate followed by a low surrogate forms a pair.
    if (is_low_surrogate(trail))
        return decode_surrogate_pair(lead, trail);

    return lead;
}
|
||||
|
||||
size_t Utf16View::code_point_offset_of(size_t code_unit_offset) const
|
||||
{
|
||||
size_t code_point_offset = 0;
|
||||
|
|
|
@ -87,6 +87,7 @@ public:
|
|||
|
||||
u16 const* data() const { return m_code_units.data(); }
|
||||
u16 code_unit_at(size_t index) const;
|
||||
u32 code_point_at(size_t index) const;
|
||||
|
||||
size_t code_point_offset_of(size_t code_unit_offset) const;
|
||||
size_t code_unit_offset_of(size_t code_point_offset) const;
|
||||
|
|
|
@ -240,7 +240,10 @@ public:
|
|||
return ch;
|
||||
},
|
||||
[&](Utf32View& view) -> u32 { return view[index]; },
|
||||
[&](Utf16View& view) -> u32 { return view.code_point_at(index); },
|
||||
[&](auto& view) -> u32 {
|
||||
// FIXME: Iterating to the code point is inefficient, particularly for very large
|
||||
// strings. Implement something like code_point_at for Utf8View.
|
||||
size_t i = index;
|
||||
for (auto it = view.begin(); it != view.end(); ++it, --i) {
|
||||
if (i == 0)
|
||||
|
|
Loading…
Add table
Reference in a new issue