LibWeb: Make the new HTML parser parse input as UTF-8

We already convert the input to UTF-8 before starting the tokenizer,
so all this patch had to do was switch the tokenizer to use a Utf8View
for its input (and to emit 32-bit code points).
This commit is contained in:
Andreas Kling 2020-06-04 21:06:54 +02:00
parent 23dad305e9
commit b6288163f1
Notes: sideshowbarker 2024-07-19 05:49:51 +09:00
3 changed files with 75 additions and 49 deletions

View file

@ -29,6 +29,7 @@
#include <AK/Queue.h>
#include <AK/StringView.h>
#include <AK/Types.h>
#include <AK/Utf8View.h>
#include <LibWeb/Forward.h>
#include <LibWeb/Parser/HTMLToken.h>
@ -170,6 +171,10 @@ private:
StringView m_input;
size_t m_cursor { 0 };
Utf8View m_utf8_view;
AK::Utf8CodepointIterator m_utf8_iterator;
AK::Utf8CodepointIterator m_prev_utf8_iterator;
HTMLToken m_current_token;
HTMLToken m_last_emitted_start_tag;