LibWeb: Plumb content encoding into the new HTML parser

We still don't handle non-ASCII input correctly, but at least now we'll convert e.g. ISO-8859-1 to UTF-8 before starting to tokenize. This patch also makes "view source" work with the new parser. :^)
Author: https://github.com/awesomekling Commit: 5e53c45113
2025-07-05 08:31:51 +00:00 · 2020-05-28 12:35:19 +02:00 · 2020-05-28 12:35:19 +02:00 · 5e53c45113 · 2024-07-19 06:02:35 +09:00
commit 5e53c45113
parent 772b51038e
6 changed files with 18 additions and 9 deletions
--- a/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
+++ b/Libraries/LibWeb/Parser/HTMLTokenizer.cpp
@ -24,6 +24,7 @@
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

+#include <LibTextCodec/Decoder.h>
 #include <LibWeb/Parser/Entities.h>
 #include <LibWeb/Parser/HTMLToken.h>
 #include <LibWeb/Parser/HTMLTokenizer.h>
@ -1711,9 +1712,12 @@ void HTMLTokenizer::create_new_token(HTMLToken::Type type)
    m_current_token.m_type = type;
 }

-HTMLTokenizer::HTMLTokenizer(const StringView& input)
-    : m_input(input)
+HTMLTokenizer::HTMLTokenizer(const StringView& input, const String& encoding)
 {
+    auto* decoder = TextCodec::decoder_for(encoding);
+    ASSERT(decoder);
+    m_decoded_input = decoder->to_utf8(input);
+    m_input = m_decoded_input;
 }

 void HTMLTokenizer::will_switch_to([[maybe_unused]] State new_state)