mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-07-19 07:22:21 +00:00
LibWeb/HTML: Use lookahead when possible for named character references
When there is an active insertion point, it's necessary to tokenize code-point-by-code-point to handle the case of document.write being used to insert a named character reference one code point at a time. However, when there is no insertion point defined, looking ahead at the input and doing the matching all-at-once is more efficient since it allows:

- Avoiding the work done in next_code_point between each code point being matched (leading to better CPU cache usage in theory)
- Skipping ahead to the end of the match all at once, which does less work overall than the equivalent number of next_code_point calls (that is, skip(N) does less work than next_code_point called N times)

In my benchmarking, this provides a small performance boost (fewer instructions, fewer CPU cycles, fewer branch misses) essentially for free.
This commit is contained in:
parent
f737ec2570
commit
7096a2892e
Notes:
github-actions[bot]
2025-07-04 09:12:34 +00:00
Author: https://github.com/squeek502
Commit: 7096a2892e
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/5297
Reviewed-by: https://github.com/gmta ✅
1 changed file with 32 additions and 8 deletions
|
@@ -1696,22 +1696,46 @@ _StartOfFunction:
|
|||
// 13.2.5.73 Named character reference state, https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
|
||||
BEGIN_STATE(NamedCharacterReference)
|
||||
{
|
||||
if (current_input_character.has_value()) {
|
||||
if (m_named_character_reference_matcher.try_consume_code_point(current_input_character.value())) {
|
||||
m_temporary_buffer.append(current_input_character.value());
|
||||
continue;
|
||||
} else {
|
||||
if (stop_at_insertion_point == StopAtInsertionPoint::Yes && is_insertion_point_defined()) {
|
||||
// If there is an insertion point, match code-point-by-code-point to handle the possibility of
|
||||
// document.write being used to insert a named character reference one-code-point-at-a-time.
|
||||
if (current_input_character.has_value()) {
|
||||
if (m_named_character_reference_matcher.try_consume_code_point(current_input_character.value())) {
|
||||
m_temporary_buffer.append(current_input_character.value());
|
||||
continue;
|
||||
} else {
|
||||
DONT_CONSUME_NEXT_INPUT_CHARACTER;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// If there's no insertion point (this is the common case), it is safe to look ahead at the rest
|
||||
// of the input and try to match a named character reference all-at-once. This is worthwhile
|
||||
// because matching all-at-once ends up being more efficient.
|
||||
auto starting_consumed_count = m_temporary_buffer.size();
|
||||
auto remaining_source = m_decoded_input.span().slice(m_prev_offset);
|
||||
|
||||
for (auto const code_point : remaining_source) {
|
||||
if (m_named_character_reference_matcher.try_consume_code_point(code_point)) {
|
||||
m_temporary_buffer.append(code_point);
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
auto num_consumed = m_temporary_buffer.size() - starting_consumed_count;
|
||||
if (num_consumed == 0) {
|
||||
DONT_CONSUME_NEXT_INPUT_CHARACTER;
|
||||
} else {
|
||||
skip(num_consumed - 1);
|
||||
}
|
||||
}
|
||||
|
||||
// Only consume the characters within the longest match. It's possible that we've overconsumed code points,
|
||||
// though, so we want to backtrack to the longest match found. For example, `¬indo` (which could still
|
||||
// have lead to `⋵̸`) would need to backtrack back to `¬`),
|
||||
// have lead to `⋵̸`) would need to backtrack back to `¬`.
|
||||
auto overconsumed_code_points = m_named_character_reference_matcher.overconsumed_code_points();
|
||||
if (overconsumed_code_points > 0) {
|
||||
auto current_byte_offset = m_current_offset;
|
||||
restore_to(current_byte_offset - overconsumed_code_points);
|
||||
restore_to(m_current_offset - overconsumed_code_points);
|
||||
m_temporary_buffer.resize_and_keep_capacity(m_temporary_buffer.size() - overconsumed_code_points);
|
||||
}
|
||||
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue