diff --git a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp index 55ed9bf8d5c..952d772a8b9 100644 --- a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp +++ b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.cpp @@ -1696,22 +1696,46 @@ _StartOfFunction: // 13.2.5.73 Named character reference state, https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state BEGIN_STATE(NamedCharacterReference) { - if (current_input_character.has_value()) { - if (m_named_character_reference_matcher.try_consume_code_point(current_input_character.value())) { - m_temporary_buffer.append(current_input_character.value()); - continue; - } else { + if (stop_at_insertion_point == StopAtInsertionPoint::Yes && is_insertion_point_defined()) { + // If there is an insertion point, match code-point-by-code-point to handle the possibility of + // document.write being used to insert a named character reference one-code-point-at-a-time. + if (current_input_character.has_value()) { + if (m_named_character_reference_matcher.try_consume_code_point(current_input_character.value())) { + m_temporary_buffer.append(current_input_character.value()); + continue; + } else { + DONT_CONSUME_NEXT_INPUT_CHARACTER; + } + } + } else { + // If there's no insertion point (this is the common case), it is safe to look ahead at the rest + // of the input and try to match a named character reference all-at-once. This is worthwhile + // because matching all-at-once ends up being more efficient. + auto starting_consumed_count = m_temporary_buffer.size(); + auto remaining_source = m_decoded_input.span().slice(m_prev_offset); + + for (auto const code_point : remaining_source) { + if (m_named_character_reference_matcher.try_consume_code_point(code_point)) { + m_temporary_buffer.append(code_point); + } else { + break; + } + } + + auto num_consumed = m_temporary_buffer.size() - starting_consumed_count; + if (num_consumed == 0) { DONT_CONSUME_NEXT_INPUT_CHARACTER; + } else { + skip(num_consumed - 1); } } // Only consume the characters within the longest match. It's possible that we've overconsumed code points, // though, so we want to backtrack to the longest match found. For example, `¬indo` (which could still - // have lead to `⋵̸`) would need to backtrack back to `¬`), + // have lead to `⋵̸`) would need to backtrack back to `¬`. auto overconsumed_code_points = m_named_character_reference_matcher.overconsumed_code_points(); if (overconsumed_code_points > 0) { - auto current_byte_offset = m_current_offset; - restore_to(current_byte_offset - overconsumed_code_points); + restore_to(m_current_offset - overconsumed_code_points); m_temporary_buffer.resize_and_keep_capacity(m_temporary_buffer.size() - overconsumed_code_points); }