LibWeb: Plumb content encoding into the new HTML parser

We still don't handle non-ASCII input correctly, but at least now we'll
convert e.g. ISO-8859-1 to UTF-8 before starting to tokenize.
This patch also makes "view source" work with the new parser. :^)
This commit is contained in:
Andreas Kling 2020-05-28 12:35:19 +02:00
parent 772b51038e
commit 5e53c45113
Notes: sideshowbarker 2024-07-19 06:02:35 +09:00
6 changed files with 18 additions and 9 deletions

View file

@ -24,6 +24,7 @@
* OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#include <LibTextCodec/Decoder.h>
#include <LibWeb/Parser/Entities.h>
#include <LibWeb/Parser/HTMLToken.h>
#include <LibWeb/Parser/HTMLTokenizer.h>
@ -1711,9 +1712,12 @@ void HTMLTokenizer::create_new_token(HTMLToken::Type type)
m_current_token.m_type = type;
}
HTMLTokenizer::HTMLTokenizer(const StringView& input)
: m_input(input)
// Constructs a tokenizer over `input`, whose bytes are interpreted according
// to `encoding`. The input is converted to UTF-8 once, up front, and the
// tokenizer is then pointed at that converted copy.
HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding)
{
    auto* codec = TextCodec::decoder_for(encoding);
    // A missing decoder for the requested encoding trips the assertion.
    ASSERT(codec);

    // Keep the converted text in a member and set m_input from it.
    // NOTE(review): m_input is presumably a non-owning view, which is why the
    // decoded string is stored as a member -- confirm against the header.
    m_decoded_input = codec->to_utf8(input);
    m_input = m_decoded_input;
}
void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)