LibWeb: Let HTMLTokenizer walk over code points instead of UTF-8
Instead of using UTF-8 iterators to traverse the HTMLTokenizer input stream one code point at a time, we now do a one-shot conversion up front from the input encoding to a Vector<u32> of Unicode code points. This simplifies the tokenizer logic somewhat, and ends up being faster as well, so win-win.

1.02x speedup on Speedometer 2.1.
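The idea is easier to see outside the diff: decode the document once, up front, into a flat buffer of code points, and let the tokenizer's cursor be a plain integer index into that buffer. Below is a minimal standalone sketch of that approach in plain C++ (std:: types rather than the AK String/Vector/Optional used in the actual change); the names decode_utf8, CodePointStream, next, peek and restore_to are illustrative, not the real Ladybird API.

// One-shot conversion: decode the whole UTF-8 input into a flat buffer of code points.
// Validation is deliberately simplified here (no overlong/surrogate checks); bad bytes
// decode to U+FFFD so the stream never has to re-validate while tokenizing.
#include <cstddef>
#include <optional>
#include <string_view>
#include <vector>

static std::vector<char32_t> decode_utf8(std::string_view input)
{
    std::vector<char32_t> code_points;
    code_points.reserve(input.size()); // at most one code point per byte
    size_t i = 0;
    while (i < input.size()) {
        unsigned char lead = static_cast<unsigned char>(input[i]);
        size_t length = 0;
        char32_t code_point = 0;
        if (lead < 0x80) {
            length = 1;
            code_point = lead;
        } else if ((lead & 0xE0) == 0xC0) {
            length = 2;
            code_point = lead & 0x1F;
        } else if ((lead & 0xF0) == 0xE0) {
            length = 3;
            code_point = lead & 0x0F;
        } else if ((lead & 0xF8) == 0xF0) {
            length = 4;
            code_point = lead & 0x07;
        } else {
            code_points.push_back(0xFFFD); // stray continuation or invalid lead byte
            ++i;
            continue;
        }
        if (i + length > input.size()) {
            code_points.push_back(0xFFFD); // truncated sequence at end of input
            break;
        }
        bool valid = true;
        for (size_t j = 1; j < length; ++j) {
            unsigned char continuation = static_cast<unsigned char>(input[i + j]);
            if ((continuation & 0xC0) != 0x80) {
                valid = false;
                break;
            }
            code_point = (code_point << 6) | (continuation & 0x3F);
        }
        code_points.push_back(valid ? code_point : 0xFFFD);
        i += valid ? length : 1;
    }
    return code_points;
}

// With the buffer materialized once, the tokenizer's cursor is just an integer index:
// next/peek are bounds-checked array reads and backtracking is plain arithmetic,
// instead of walking (and re-walking) a UTF-8 iterator.
class CodePointStream {
public:
    explicit CodePointStream(std::string_view utf8_input)
        : m_code_points(decode_utf8(utf8_input))
    {
    }

    std::optional<char32_t> next()
    {
        if (m_offset >= m_code_points.size())
            return std::nullopt;
        m_prev_offset = m_offset;
        return m_code_points[m_offset++];
    }

    std::optional<char32_t> peek(size_t lookahead = 0) const
    {
        size_t index = m_offset + lookahead;
        if (index >= m_code_points.size())
            return std::nullopt;
        return m_code_points[index];
    }

    // Reconsume the last code point (what the tokenizer's RECONSUME/DONT_CONSUME macros need).
    void reconsume_last() { m_offset = m_prev_offset; }

    // Arbitrary backtracking, e.g. after overconsuming during named character reference matching.
    void restore_to(size_t offset) { m_offset = offset; }
    size_t offset() const { return m_offset; }

private:
    std::vector<char32_t> m_code_points;
    size_t m_offset { 0 };
    size_t m_prev_offset { 0 };
};

The diff below is essentially this change applied to HTMLTokenizer: m_utf8_view/m_utf8_iterator/m_prev_utf8_iterator become Vector<u32> m_decoded_input plus ssize_t m_current_offset/m_prev_offset, and the iterator-to-byte-offset conversions in peek_code_point, restore_to and the insertion-point checks reduce to integer comparisons.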
Parent: 7cccdb3bcf
Commit: 263b125782

Notes (github-actions[bot], 2025-05-10 23:14:16 +00:00):
    Author: https://github.com/awesomekling
    Commit: 263b125782
    Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/4687
3 changed files with 53 additions and 54 deletions
HTMLParser.cpp
@@ -268,7 +268,7 @@ void HTMLParser::run(HTMLTokenizer::StopAtInsertionPoint stop_at_insertion_point
 void HTMLParser::run(const URL::URL& url, HTMLTokenizer::StopAtInsertionPoint stop_at_insertion_point)
 {
     m_document->set_url(url);
-    m_document->set_source(MUST(String::from_byte_string(m_tokenizer.source())));
+    m_document->set_source(m_tokenizer.source());
     run(stop_at_insertion_point);
     the_end(*m_document, this);
 }

HTMLTokenizer.cpp
@@ -59,7 +59,7 @@ namespace Web::HTML {
         will_reconsume_in(m_return_state); \
         m_state = m_return_state; \
         if (current_input_character.has_value()) \
-            restore_to(m_prev_utf8_iterator); \
+            restore_to(m_prev_offset); \
         goto _StartOfFunction; \
     } while (0)
 
@@ -97,7 +97,7 @@ namespace Web::HTML {
 #define DONT_CONSUME_NEXT_INPUT_CHARACTER \
     do { \
         if (current_input_character.has_value()) \
-            restore_to(m_prev_utf8_iterator); \
+            restore_to(m_prev_offset); \
     } while (0)
 
 #define ON(code_point) \
@@ -195,7 +195,7 @@ static inline void log_parse_error(SourceLocation const& location = SourceLocati
 
 Optional<u32> HTMLTokenizer::next_code_point(StopAtInsertionPoint stop_at_insertion_point)
 {
-    if (m_utf8_iterator == m_utf8_view.end())
+    if (m_current_offset >= static_cast<ssize_t>(m_decoded_input.size()))
         return {};
 
     u32 code_point;
@@ -211,7 +211,7 @@ Optional<u32> HTMLTokenizer::next_code_point(StopAtInsertionPoint stop_at_insert
         code_point = '\n';
     } else {
         skip(1);
-        code_point = *m_prev_utf8_iterator;
+        code_point = m_decoded_input[m_prev_offset];
     }
 
     dbgln_if(TOKENIZER_TRACE_DEBUG, "(Tokenizer) Next code_point: {}", code_point);
@@ -223,8 +223,8 @@ void HTMLTokenizer::skip(size_t count)
     if (!m_source_positions.is_empty())
         m_source_positions.append(m_source_positions.last());
     for (size_t i = 0; i < count; ++i) {
-        m_prev_utf8_iterator = m_utf8_iterator;
-        auto code_point = *m_utf8_iterator;
+        m_prev_offset = m_current_offset;
+        auto code_point = m_decoded_input[m_current_offset];
         if (!m_source_positions.is_empty()) {
             if (code_point == '\n') {
                 m_source_positions.last().column = 0;
@@ -233,23 +233,21 @@ void HTMLTokenizer::skip(size_t count)
                 m_source_positions.last().column++;
             }
         }
-        ++m_utf8_iterator;
+        ++m_current_offset;
     }
 }
 
-Optional<u32> HTMLTokenizer::peek_code_point(size_t offset, StopAtInsertionPoint stop_at_insertion_point) const
+Optional<u32> HTMLTokenizer::peek_code_point(ssize_t offset, StopAtInsertionPoint stop_at_insertion_point) const
 {
-    auto it = m_utf8_iterator;
-    for (size_t i = 0; i < offset && it != m_utf8_view.end(); ++i)
-        ++it;
+    auto it = m_current_offset + offset;
+    if (it >= static_cast<ssize_t>(m_decoded_input.size()))
+        return {};
     if (stop_at_insertion_point == StopAtInsertionPoint::Yes
         && m_insertion_point.defined
-        && m_utf8_view.byte_offset_of(it) >= m_insertion_point.position) {
+        && it >= m_insertion_point.position) {
         return {};
     }
-    if (it == m_utf8_view.end())
-        return {};
-    return *it;
+    return m_decoded_input[it];
 }
 
 HTMLToken::Position HTMLTokenizer::nth_last_position(size_t n)
@@ -1712,10 +1710,8 @@ _StartOfFunction:
            // have lead to `⋵̸`) would need to backtrack back to `¬`),
            auto overconsumed_code_points = m_named_character_reference_matcher.overconsumed_code_points();
            if (overconsumed_code_points > 0) {
-               auto current_byte_offset = m_utf8_view.byte_offset_of(m_utf8_iterator);
-               // All consumed code points during character reference matching are guaranteed to be
-               // within the ASCII range, so they are always 1 byte wide.
-               restore_to(m_utf8_view.iterator_at_byte_offset_without_validation(current_byte_offset - overconsumed_code_points));
+               auto current_byte_offset = m_current_offset;
+               restore_to(current_byte_offset - overconsumed_code_points);
                m_temporary_buffer.resize_and_keep_capacity(m_temporary_buffer.size() - overconsumed_code_points);
            }
 
@@ -2856,10 +2852,9 @@ void HTMLTokenizer::create_new_token(HTMLToken::Type type)
 
 HTMLTokenizer::HTMLTokenizer()
 {
-    m_decoded_input = "";
-    m_utf8_view = Utf8View(m_decoded_input);
-    m_utf8_iterator = m_utf8_view.begin();
-    m_prev_utf8_iterator = m_utf8_view.begin();
+    m_decoded_input = {};
+    m_current_offset = 0;
+    m_prev_offset = 0;
     m_source_positions.empend(0u, 0u);
 }
 
@@ -2867,30 +2862,35 @@ HTMLTokenizer::HTMLTokenizer(StringView input, ByteString const& encoding)
 {
     auto decoder = TextCodec::decoder_for(encoding);
     VERIFY(decoder.has_value());
-    m_decoded_input = decoder->to_utf8(input).release_value_but_fixme_should_propagate_errors().to_byte_string();
-    m_utf8_view = Utf8View(m_decoded_input);
-    m_utf8_iterator = m_utf8_view.begin();
-    m_prev_utf8_iterator = m_utf8_view.begin();
+    m_source = MUST(decoder->to_utf8(input));
+    m_decoded_input.ensure_capacity(m_source.bytes().size());
+    for (auto code_point : m_source.code_points())
+        m_decoded_input.append(code_point);
+    m_current_offset = 0;
+    m_prev_offset = 0;
     m_source_positions.empend(0u, 0u);
 }
 
 void HTMLTokenizer::insert_input_at_insertion_point(StringView input)
 {
-    auto utf8_iterator_byte_offset = m_utf8_view.byte_offset_of(m_utf8_iterator);
-    auto prev_utf8_iterator_byte_offset = m_utf8_view.byte_offset_of(m_prev_utf8_iterator);
+    Vector<u32> new_decoded_input;
+    new_decoded_input.ensure_capacity(m_decoded_input.size() + input.length());
 
-    // FIXME: Implement a InputStream to handle insertion_point and iterators.
-    StringBuilder builder {};
-    builder.append(m_decoded_input.substring_view(0, m_insertion_point.position));
-    builder.append(input);
-    builder.append(m_decoded_input.substring_view(m_insertion_point.position));
-    m_decoded_input = builder.to_byte_string();
+    auto before = m_decoded_input.span().slice(0, m_insertion_point.position);
+    new_decoded_input.append(before.data(), before.size());
 
-    m_utf8_view = Utf8View(m_decoded_input);
-    m_utf8_iterator = m_utf8_view.iterator_at_byte_offset(utf8_iterator_byte_offset);
-    m_prev_utf8_iterator = m_utf8_view.iterator_at_byte_offset(prev_utf8_iterator_byte_offset);
+    auto utf8_to_insert = MUST(String::from_utf8(input));
+    ssize_t code_points_inserted = 0;
+    for (auto code_point : utf8_to_insert.code_points()) {
+        new_decoded_input.append(code_point);
+        ++code_points_inserted;
+    }
 
-    m_insertion_point.position += input.length();
+    auto after = m_decoded_input.span().slice(m_insertion_point.position);
+    new_decoded_input.append(after.data(), after.size());
+    m_decoded_input = move(new_decoded_input);
+
+    m_insertion_point.position += code_points_inserted;
 }
 
 void HTMLTokenizer::insert_eof()
@@ -2944,9 +2944,9 @@ bool HTMLTokenizer::consumed_as_part_of_an_attribute() const
     return m_return_state == State::AttributeValueUnquoted || m_return_state == State::AttributeValueSingleQuoted || m_return_state == State::AttributeValueDoubleQuoted;
 }
 
-void HTMLTokenizer::restore_to(Utf8CodePointIterator const& new_iterator)
+void HTMLTokenizer::restore_to(ssize_t new_iterator)
 {
-    auto diff = m_utf8_iterator - new_iterator;
+    auto diff = m_current_offset - new_iterator;
     if (diff > 0) {
         for (ssize_t i = 0; i < diff; ++i) {
             if (!m_source_positions.is_empty())
@@ -2956,7 +2956,7 @@ void HTMLTokenizer::restore_to(Utf8CodePointIterator const& new_iterator)
         // Going forwards...?
         TODO();
     }
-    m_utf8_iterator = new_iterator;
+    m_current_offset = new_iterator;
 }
 
 String HTMLTokenizer::consume_current_builder()

HTMLTokenizer.h
@@ -11,7 +11,6 @@
 #include <AK/StringBuilder.h>
 #include <AK/StringView.h>
 #include <AK/Types.h>
-#include <AK/Utf8View.h>
 #include <LibGC/Ptr.h>
 #include <LibWeb/Forward.h>
 #include <LibWeb/HTML/Parser/Entities.h>
@@ -129,7 +128,7 @@ public:
     void set_blocked(bool b) { m_blocked = b; }
    bool is_blocked() const { return m_blocked; }
 
-    ByteString source() const { return m_decoded_input; }
+    auto const& source() const { return m_source; }
 
     void insert_input_at_insertion_point(StringView input);
     void insert_eof();
@@ -138,7 +137,7 @@ public:
     bool is_insertion_point_defined() const { return m_insertion_point.defined; }
     bool is_insertion_point_reached()
     {
-        return m_insertion_point.defined && m_utf8_view.iterator_offset(m_utf8_iterator) >= m_insertion_point.position;
+        return m_insertion_point.defined && m_current_offset >= m_insertion_point.position;
     }
     void undefine_insertion_point() { m_insertion_point.defined = false; }
     void store_insertion_point() { m_old_insertion_point = m_insertion_point; }
@@ -146,7 +145,7 @@ public:
     void update_insertion_point()
     {
         m_insertion_point.defined = true;
-        m_insertion_point.position = m_utf8_view.iterator_offset(m_utf8_iterator);
+        m_insertion_point.position = m_current_offset;
     }
 
     // This permanently cuts off the tokenizer input stream.
@@ -155,7 +154,7 @@ public:
 private:
     void skip(size_t count);
     Optional<u32> next_code_point(StopAtInsertionPoint);
-    Optional<u32> peek_code_point(size_t offset, StopAtInsertionPoint) const;
+    Optional<u32> peek_code_point(ssize_t offset, StopAtInsertionPoint) const;
 
     enum class ConsumeNextResult {
         Consumed,
@@ -186,7 +185,7 @@ private:
 
     bool consumed_as_part_of_an_attribute() const;
 
-    void restore_to(Utf8CodePointIterator const& new_iterator);
+    void restore_to(ssize_t new_iterator);
     HTMLToken::Position nth_last_position(size_t n = 0);
 
     GC::Ptr<HTMLParser> m_parser;
@@ -196,18 +195,18 @@ private:
 
     Vector<u32> m_temporary_buffer;
 
-    ByteString m_decoded_input;
+    String m_source;
+    Vector<u32> m_decoded_input;
 
     struct InsertionPoint {
-        size_t position { 0 };
+        ssize_t position { 0 };
         bool defined { false };
     };
    InsertionPoint m_insertion_point {};
    InsertionPoint m_old_insertion_point {};
 
-    Utf8View m_utf8_view;
-    Utf8CodePointIterator m_utf8_iterator;
-    Utf8CodePointIterator m_prev_utf8_iterator;
+    ssize_t m_current_offset { 0 };
+    ssize_t m_prev_offset { 0 };
 
    HTMLToken m_current_token;
    StringBuilder m_current_builder;