LibWeb: Start building the tree building part of the new HTML parser
This patch adds a new HTMLDocumentParser class. It keeps a tokenizer object internally and pulls one token at a time from it. The names and idioms in this class follow the actual HTML parsing spec as closely as possible, to make development as easy and bug-free as possible. :^) This is going to become pretty large, but it's pretty cool!
Parent: 0b61e21873
Commit: fd1b31d0ff

Notes (sideshowbarker, 2024-07-19 06:12:03 +09:00):
    Author: https://github.com/awesomekling
    Commit: fd1b31d0ff
8 changed files with 515 additions and 76 deletions
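For orientation, here is a minimal sketch of the token pump the commit message describes: a parser object that owns an HTMLTokenizer and pulls one token at a time via next_token() (the API change visible in the diff below), handing each token to the tree construction stage. Only the HTMLDocumentParser/HTMLTokenizer/HTMLToken names and the Optional<HTMLToken> next_token() signature come from this commit; the constructor, run(), and process_token() shown here are assumptions for illustration, not the commit's actual code.

    // Hypothetical sketch only -- the real HTMLDocumentParser added by this commit
    // follows the HTML spec's insertion modes and is considerably larger.
    #include <LibWeb/Parser/HTMLToken.h>
    #include <LibWeb/Parser/HTMLTokenizer.h>

    namespace Web {

    class HTMLDocumentParser {
    public:
        // Assumption: the tokenizer is constructed from the input markup.
        explicit HTMLDocumentParser(const StringView& input)
            : m_tokenizer(input)
        {
        }

        void run()
        {
            // Pull-based loop: ask the tokenizer for one token at a time and
            // feed each token to the tree construction stage.
            for (;;) {
                auto optional_token = m_tokenizer.next_token();
                if (!optional_token.has_value())
                    break;
                auto& token = optional_token.value();
                process_token(token);
                if (token.type() == HTMLToken::Type::EndOfFile)
                    break;
            }
        }

    private:
        void process_token(HTMLToken& token)
        {
            // In the real parser this dispatches on the current insertion mode
            // ("initial", "before html", "before head", ...) per the spec.
            (void)token;
        }

        HTMLTokenizer m_tokenizer;
    };

    }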
@@ -28,6 +28,8 @@
 #include <LibWeb/Parser/HTMLTokenizer.h>
 #include <ctype.h>

 #pragma GCC diagnostic ignored "-Wunused-label"

 //#define TOKENIZER_TRACE

 #define TODO() \
@@ -47,6 +49,11 @@
     m_state = State::new_state; \
     goto new_state;

+#define SWITCH_TO_AND_EMIT_CURRENT_TOKEN(new_state) \
+    will_switch_to(State::new_state); \
+    m_state = State::new_state; \
+    return m_current_token;
+
 #define DONT_CONSUME_NEXT_INPUT_CHARACTER --m_cursor;

 #define ON(codepoint) \
@@ -66,10 +73,12 @@
 #define ANYTHING_ELSE if (1)

-#define EMIT_EOF_AND_RETURN \
+#define EMIT_EOF \
     create_new_token(HTMLToken::Type::EndOfFile); \
     emit_current_token(); \
-    return;
+    return m_current_token;
+
+#define EMIT_CURRENT_TOKEN \
+    return m_current_token;

 #define BEGIN_STATE(state) \
 state: \
@@ -100,7 +109,7 @@ Optional<u32> HTMLTokenizer::peek_codepoint(size_t offset) const
     return m_input[m_cursor + offset];
 }

-void HTMLTokenizer::run()
+Optional<HTMLToken> HTMLTokenizer::next_token()
 {
     for (;;) {
         auto current_input_character = next_codepoint();
@@ -118,7 +127,7 @@ void HTMLTokenizer::run()
             }
             ON_EOF
             {
-                EMIT_EOF_AND_RETURN;
+                EMIT_EOF;
             }
             ANYTHING_ELSE
             {
@@ -168,8 +177,7 @@ void HTMLTokenizer::run()
             }
             ON('>')
             {
-                emit_current_token();
-                SWITCH_TO(Data);
+                SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
             }
             ANYTHING_ELSE
             {
@@ -266,8 +274,7 @@ void HTMLTokenizer::run()
             }
             ON('>')
             {
-                emit_current_token();
-                SWITCH_TO(Data);
+                SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
             }
             ON_ASCII_UPPER_ALPHA
             {
@@ -297,8 +304,7 @@ void HTMLTokenizer::run()
             }
             ON('>')
             {
-                emit_current_token();
-                SWITCH_TO(Data);
+                SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
             }
             ON_EOF
             {
@@ -473,8 +479,7 @@ void HTMLTokenizer::run()
             }
             ON('>')
             {
-                emit_current_token();
-                SWITCH_TO(Data);
+                SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
             }
             ON(0)
             {
@@ -504,8 +509,7 @@ void HTMLTokenizer::run()
             }
             ON('>')
             {
-                emit_current_token();
-                SWITCH_TO(Data);
+                SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
             }
             ON_EOF
             {
@@ -588,8 +592,7 @@ void HTMLTokenizer::run()
        {
            ON('>')
            {
-                emit_current_token();
-                SWITCH_TO(Data);
+                SWITCH_TO_AND_EMIT_CURRENT_TOKEN(Data);
            }
            ON('!')
            {
@@ -741,57 +744,6 @@ bool HTMLTokenizer::next_few_characters_are(const StringView& string) const
     return true;
 }

-void HTMLTokenizer::emit_current_token()
-{
-    StringBuilder builder;
-
-    switch (m_current_token.type()) {
-    case HTMLToken::Type::DOCTYPE:
-        builder.append("DOCTYPE");
-        builder.append(" { name: '");
-        builder.append(m_current_token.m_doctype.name.to_string());
-        builder.append("' }");
-        break;
-    case HTMLToken::Type::StartTag:
-        builder.append("StartTag");
-        break;
-    case HTMLToken::Type::EndTag:
-        builder.append("EndTag");
-        break;
-    case HTMLToken::Type::Comment:
-        builder.append("Comment");
-        break;
-    case HTMLToken::Type::Character:
-        builder.append("Character");
-        break;
-    case HTMLToken::Type::EndOfFile:
-        builder.append("EndOfFile");
-        break;
-    }
-
-    if (m_current_token.type() == HTMLToken::Type::StartTag || m_current_token.type() == HTMLToken::Type::EndTag) {
-        builder.append(" { name: '");
-        builder.append(m_current_token.m_tag.tag_name.to_string());
-        builder.append("', { ");
-        for (auto& attribute : m_current_token.m_tag.attributes) {
-            builder.append(attribute.name_builder.to_string());
-            builder.append("=\"");
-            builder.append(attribute.value_builder.to_string());
-            builder.append("\" ");
-        }
-        builder.append("} }");
-    }
-
-    if (m_current_token.type() == HTMLToken::Type::Comment || m_current_token.type() == HTMLToken::Type::Character) {
-        builder.append(" { data: '");
-        builder.append(m_current_token.m_comment_or_character.data.to_string());
-        builder.append(" }");
-    }
-
-    dbg() << "[" << String::format("%42s", state_name(m_state)) << "] " << builder.to_string();
-    m_current_token = {};
-}
-
 void HTMLTokenizer::create_new_token(HTMLToken::Type type)
 {
     flush_current_character_or_comment_if_needed();
@@ -822,8 +774,8 @@ void HTMLTokenizer::will_reconsume_in([[maybe_unused]] State new_state)

 void HTMLTokenizer::flush_current_character_or_comment_if_needed()
 {
-    if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment)
-        emit_current_token();
+    //if (m_current_token.type() == HTMLToken::Type::Character || m_current_token.type() == HTMLToken::Type::Comment)
+    //    emit_current_token();
 }

 }