mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-07-29 04:09:13 +00:00
LibWeb: Add an HTML tokenizer re-implementation in Swift
It doesn't do much yet; the fun part was the scaffolding.
This commit is contained in:
parent
4ba274691e
commit
49733ed09b
Notes:
github-actions[bot]
2024-08-25 01:15:04 +00:00
Author: https://github.com/ADKaster
Commit: 49733ed09b
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/1175
Reviewed-by: https://github.com/AtkinsSJ
4 changed files with 183 additions and 0 deletions
|
@ -789,6 +789,8 @@ target_link_libraries(LibWeb PRIVATE LibCore LibCrypto LibJS LibHTTP LibGfx LibI
|
|||
generate_js_bindings(LibWeb)
|
||||
|
||||
if (ENABLE_SWIFT)
|
||||
include(collections)
|
||||
|
||||
set(generated_headers ${GENERATED_SOURCES})
|
||||
list(FILTER generated_headers INCLUDE REGEX "\\.h$")
|
||||
list(TRANSFORM generated_headers PREPEND "${CMAKE_CURRENT_BINARY_DIR}/")
|
||||
|
@ -800,10 +802,14 @@ if (ENABLE_SWIFT)
|
|||
|
||||
target_sources(LibWeb PRIVATE
|
||||
HTML/Parser/HTMLToken.swift
|
||||
HTML/Parser/HTMLTokenizer.swift
|
||||
HTML/Parser/HTMLTokenizerHelpers.cpp
|
||||
)
|
||||
target_compile_definitions(LibWeb PRIVATE LIBWEB_USE_SWIFT)
|
||||
set_target_properties(LibWeb PROPERTIES Swift_MODULE_NAME "SwiftLibWeb")
|
||||
|
||||
target_link_libraries(LibWeb PRIVATE AK Collections)
|
||||
|
||||
# FIXME: These should be pulled automatically from interface compile options for the target
|
||||
set(VFS_OVERLAY_OPTIONS
|
||||
-Xcc -ivfsoverlay${CMAKE_CURRENT_BINARY_DIR}/vfs_overlay.yaml
|
||||
|
@ -811,6 +817,8 @@ if (ENABLE_SWIFT)
|
|||
-Xcc -ivfsoverlay${Lagom_BINARY_DIR}/AK/vfs_overlay.yaml
|
||||
)
|
||||
get_target_property(LIBWEB_NATIVE_DIRS LibWeb INCLUDE_DIRECTORIES)
|
||||
list(APPEND LIBWEB_NATIVE_DIRS ${CMAKE_Swift_MODULE_DIRECTORY})
|
||||
|
||||
_swift_generate_cxx_header(LibWeb "LibWeb-Swift.h"
|
||||
SEARCH_PATHS ${LIBWEB_NATIVE_DIRS}
|
||||
COMPILE_OPTIONS ${VFS_OVERLAY_OPTIONS}
|
||||
|
|
133
Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
Normal file
133
Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
Normal file
|
@ -0,0 +1,133 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
import Collections
|
||||
import Foundation
|
||||
import LibWeb
|
||||
import SwiftAK
|
||||
|
||||
extension Swift.String {
    /// Creates a Swift string by decoding `text` from the encoding named by `encoding`.
    ///
    /// The work is delegated to `Web.HTML.decode_to_utf8`. Initialization fails
    /// (producing `nil`) whenever that helper hands back an empty optional.
    ///
    /// - Parameters:
    ///   - text: The bytes to decode.
    ///   - encoding: The name of the encoding `text` is expressed in.
    public init?(decoding text: AK.StringView, as encoding: AK.StringView) {
        let decodeResult = Web.HTML.decode_to_utf8(text, encoding)

        // Bail out early when the helper reported failure.
        guard decodeResult.hasValue else {
            return nil
        }

        self.init(decodeResult.value!)
    }
}
|
||||
|
||||
/// Scaffolding for an HTML tokenizer.
///
/// At this stage the class only stores the decoded input and hands out tokens
/// that are already sitting in `queuedTokens`; no state machine is driven yet.
class HTMLTokenizer {

    /// Tokenizer states.
    ///
    /// NOTE(review): the case names appear to follow the tokenization states of
    /// the HTML Standard one-to-one -- confirm against the spec when the state
    /// machine is implemented.
    enum State {
        case Data
        case RCDATA
        case RAWTEXT
        case ScriptData
        case PLAINTEXT
        case TagOpen
        case EndTagOpen
        case TagName
        case RCDATALessThanSign
        case RCDATAEndTagOpen
        case RCDATAEndTagName
        case RAWTEXTLessThanSign
        case RAWTEXTEndTagOpen
        case RAWTEXTEndTagName
        case ScriptDataLessThanSign
        case ScriptDataEndTagOpen
        case ScriptDataEndTagName
        case ScriptDataEscapeStart
        case ScriptDataEscapeStartDash
        case ScriptDataEscaped
        case ScriptDataEscapedDash
        case ScriptDataEscapedDashDash
        case ScriptDataEscapedLessThanSign
        case ScriptDataEscapedEndTagOpen
        case ScriptDataEscapedEndTagName
        case ScriptDataDoubleEscapeStart
        case ScriptDataDoubleEscaped
        case ScriptDataDoubleEscapedDash
        case ScriptDataDoubleEscapedDashDash
        case ScriptDataDoubleEscapedLessThanSign
        case ScriptDataDoubleEscapeEnd
        case BeforeAttributeName
        case AttributeName
        case AfterAttributeName
        case BeforeAttributeValue
        case AttributeValueDoubleQuoted
        case AttributeValueSingleQuoted
        case AttributeValueUnquoted
        case AfterAttributeValueQuoted
        case SelfClosingStartTag
        case BogusComment
        case MarkupDeclarationOpen
        case CommentStart
        case CommentStartDash
        case Comment
        case CommentLessThanSign
        case CommentLessThanSignBang
        case CommentLessThanSignBangDash
        case CommentLessThanSignBangDashDash
        case CommentEndDash
        case CommentEnd
        case CommentEndBang
        case DOCTYPE
        case BeforeDOCTYPEName
        case DOCTYPEName
        case AfterDOCTYPEName
        case AfterDOCTYPEPublicKeyword
        case BeforeDOCTYPEPublicIdentifier
        case DOCTYPEPublicIdentifierDoubleQuoted
        case DOCTYPEPublicIdentifierSingleQuoted
        case AfterDOCTYPEPublicIdentifier
        case BetweenDOCTYPEPublicAndSystemIdentifiers
        case AfterDOCTYPESystemKeyword
        case BeforeDOCTYPESystemIdentifier
        case DOCTYPESystemIdentifierDoubleQuoted
        case DOCTYPESystemIdentifierSingleQuoted
        case AfterDOCTYPESystemIdentifier
        case BogusDOCTYPE
        case CDATASection
        case CDATASectionBracket
        case CDATASectionEnd
        case CharacterReference
        case NamedCharacterReference
        case AmbiguousAmpersand
        case NumericCharacterReference
        case HexadecimalCharacterReferenceStart
        case DecimalCharacterReferenceStart
        case HexadecimalCharacterReference
        case DecimalCharacterReference
        case NumericCharacterReferenceEnd
    }

    /// The decoded text to tokenize; empty until set by `init?(input:encoding:)`.
    var input = Swift.String()
    /// Current tokenizer state; starts in `.Data`.
    var state = State.Data
    /// State recorded for a later return; starts in `.Data`.
    var returnState = State.Data

    /// The token currently under construction.
    var currentToken = HTMLToken()
    /// Tokens waiting to be handed out by `nextToken`, in FIFO order.
    var queuedTokens = Deque<HTMLToken>()

    /// Creates a tokenizer with empty input.
    public init() {}

    /// Creates a tokenizer over `input`, decoded from the named `encoding`.
    ///
    /// Fails (returns `nil`) when `Swift.String(decoding:as:)` cannot decode the input.
    public init?(input: AK.StringView, encoding: AK.StringView) {
        guard let decoded = Swift.String(decoding: input, as: encoding) else {
            return nil
        }
        self.input = decoded
    }

    /// Returns the next queued token, or `nil` when the queue is empty.
    ///
    /// - Parameter stopAtInsertionPoint: Currently unused by this implementation.
    public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
        // The original wrapped this in a `while` whose body returned on the
        // first pass, so it never looped; a plain conditional states the
        // actual behavior. `popFirst()` yields nil for an empty deque.
        if let queued = queuedTokens.popFirst() {
            return queued
        }

        return nil
    }

}
|
|
@ -0,0 +1,23 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Andrew Kaster <akaster@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#include <LibTextCodec/Decoder.h>
|
||||
#include <LibWeb/HTML/Parser/HTMLTokenizerHelpers.h>
|
||||
|
||||
namespace Web::HTML {
|
||||
|
||||
OptionalString decode_to_utf8(StringView text, StringView encoding)
|
||||
{
|
||||
auto decoder = TextCodec::decoder_for(encoding);
|
||||
if (!decoder.has_value())
|
||||
return std::nullopt;
|
||||
auto decoded_or_error = decoder.value().to_utf8(text);
|
||||
if (decoded_or_error.is_error())
|
||||
return std::nullopt;
|
||||
return decoded_or_error.release_value();
|
||||
}
|
||||
|
||||
}
|
19
Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.h
Normal file
19
Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.h
Normal file
|
@ -0,0 +1,19 @@
|
|||
/*
|
||||
* Copyright (c) 2024, Andrew Kaster <akaster@serenityos.org>
|
||||
*
|
||||
* SPDX-License-Identifier: BSD-2-Clause
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <AK/String.h>
|
||||
#include <AK/StringView.h>
|
||||
#include <optional>
|
||||
|
||||
namespace Web::HTML {
|
||||
|
||||
// Swift-friendly wrapper for TextCodec::Decoder::to_utf8
|
||||
using OptionalString = std::optional<String>;
|
||||
OptionalString decode_to_utf8(StringView text, StringView encoding);
|
||||
|
||||
}
|
Loading…
Add table
Add a link
Reference in a new issue