mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-07-29 04:09:13 +00:00
LibWeb: Add an HTML tokenizer re-implementation in Swift
It doesn't do much yet; the fun part was the scaffolding.
This commit is contained in:
parent
4ba274691e
commit
49733ed09b
Notes:
github-actions[bot]
2024-08-25 01:15:04 +00:00
Author: https://github.com/ADKaster
Commit: 49733ed09b
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/1175
Reviewed-by: https://github.com/AtkinsSJ
4 changed files with 183 additions and 0 deletions
|
@ -789,6 +789,8 @@ target_link_libraries(LibWeb PRIVATE LibCore LibCrypto LibJS LibHTTP LibGfx LibI
|
||||||
generate_js_bindings(LibWeb)
|
generate_js_bindings(LibWeb)
|
||||||
|
|
||||||
if (ENABLE_SWIFT)
|
if (ENABLE_SWIFT)
|
||||||
|
include(collections)
|
||||||
|
|
||||||
set(generated_headers ${GENERATED_SOURCES})
|
set(generated_headers ${GENERATED_SOURCES})
|
||||||
list(FILTER generated_headers INCLUDE REGEX "\\.h$")
|
list(FILTER generated_headers INCLUDE REGEX "\\.h$")
|
||||||
list(TRANSFORM generated_headers PREPEND "${CMAKE_CURRENT_BINARY_DIR}/")
|
list(TRANSFORM generated_headers PREPEND "${CMAKE_CURRENT_BINARY_DIR}/")
|
||||||
|
@ -800,10 +802,14 @@ if (ENABLE_SWIFT)
|
||||||
|
|
||||||
target_sources(LibWeb PRIVATE
|
target_sources(LibWeb PRIVATE
|
||||||
HTML/Parser/HTMLToken.swift
|
HTML/Parser/HTMLToken.swift
|
||||||
|
HTML/Parser/HTMLTokenizer.swift
|
||||||
|
HTML/Parser/HTMLTokenizerHelpers.cpp
|
||||||
)
|
)
|
||||||
target_compile_definitions(LibWeb PRIVATE LIBWEB_USE_SWIFT)
|
target_compile_definitions(LibWeb PRIVATE LIBWEB_USE_SWIFT)
|
||||||
set_target_properties(LibWeb PROPERTIES Swift_MODULE_NAME "SwiftLibWeb")
|
set_target_properties(LibWeb PROPERTIES Swift_MODULE_NAME "SwiftLibWeb")
|
||||||
|
|
||||||
|
target_link_libraries(LibWeb PRIVATE AK Collections)
|
||||||
|
|
||||||
# FIXME: These should be pulled automatically from interface compile options for the target
|
# FIXME: These should be pulled automatically from interface compile options for the target
|
||||||
set(VFS_OVERLAY_OPTIONS
|
set(VFS_OVERLAY_OPTIONS
|
||||||
-Xcc -ivfsoverlay${CMAKE_CURRENT_BINARY_DIR}/vfs_overlay.yaml
|
-Xcc -ivfsoverlay${CMAKE_CURRENT_BINARY_DIR}/vfs_overlay.yaml
|
||||||
|
@ -811,6 +817,8 @@ if (ENABLE_SWIFT)
|
||||||
-Xcc -ivfsoverlay${Lagom_BINARY_DIR}/AK/vfs_overlay.yaml
|
-Xcc -ivfsoverlay${Lagom_BINARY_DIR}/AK/vfs_overlay.yaml
|
||||||
)
|
)
|
||||||
get_target_property(LIBWEB_NATIVE_DIRS LibWeb INCLUDE_DIRECTORIES)
|
get_target_property(LIBWEB_NATIVE_DIRS LibWeb INCLUDE_DIRECTORIES)
|
||||||
|
list(APPEND LIBWEB_NATIVE_DIRS ${CMAKE_Swift_MODULE_DIRECTORY})
|
||||||
|
|
||||||
_swift_generate_cxx_header(LibWeb "LibWeb-Swift.h"
|
_swift_generate_cxx_header(LibWeb "LibWeb-Swift.h"
|
||||||
SEARCH_PATHS ${LIBWEB_NATIVE_DIRS}
|
SEARCH_PATHS ${LIBWEB_NATIVE_DIRS}
|
||||||
COMPILE_OPTIONS ${VFS_OVERLAY_OPTIONS}
|
COMPILE_OPTIONS ${VFS_OVERLAY_OPTIONS}
|
||||||
|
|
133
Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
Normal file
133
Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
Normal file
|
@ -0,0 +1,133 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
|
*/
|
||||||
|
|
||||||
|
import Collections
|
||||||
|
import Foundation
|
||||||
|
import LibWeb
|
||||||
|
import SwiftAK
|
||||||
|
|
||||||
|
extension Swift.String {
    /// Creates a string by decoding `decoding` using the text encoding named by `as`.
    ///
    /// The work is delegated to `Web.HTML.decode_to_utf8`. Initialization fails
    /// (produces `nil`) when that helper returns an empty optional.
    public init?(decoding: AK.StringView, as encoding: AK.StringView) {
        let decodedText = Web.HTML.decode_to_utf8(decoding, encoding)

        // Bail out early when the C++ side reported no result.
        guard decodedText.hasValue else {
            return nil
        }

        self.init(decodedText.value!)
    }
}
|
||||||
|
|
||||||
|
/// Scaffolding for an HTML tokenizer.
///
/// At this stage the class only stores its input and state and hands out
/// tokens that are already sitting in `queuedTokens`; no state-machine logic
/// consumes `input` yet.
class HTMLTokenizer {

    /// Tokenizer states.
    /// NOTE(review): the names appear to mirror the tokenization states of the
    /// HTML Standard — confirm against the spec before relying on that mapping.
    enum State {
        case Data
        case RCDATA
        case RAWTEXT
        case ScriptData
        case PLAINTEXT
        case TagOpen
        case EndTagOpen
        case TagName
        case RCDATALessThanSign
        case RCDATAEndTagOpen
        case RCDATAEndTagName
        case RAWTEXTLessThanSign
        case RAWTEXTEndTagOpen
        case RAWTEXTEndTagName
        case ScriptDataLessThanSign
        case ScriptDataEndTagOpen
        case ScriptDataEndTagName
        case ScriptDataEscapeStart
        case ScriptDataEscapeStartDash
        case ScriptDataEscaped
        case ScriptDataEscapedDash
        case ScriptDataEscapedDashDash
        case ScriptDataEscapedLessThanSign
        case ScriptDataEscapedEndTagOpen
        case ScriptDataEscapedEndTagName
        case ScriptDataDoubleEscapeStart
        case ScriptDataDoubleEscaped
        case ScriptDataDoubleEscapedDash
        case ScriptDataDoubleEscapedDashDash
        case ScriptDataDoubleEscapedLessThanSign
        case ScriptDataDoubleEscapeEnd
        case BeforeAttributeName
        case AttributeName
        case AfterAttributeName
        case BeforeAttributeValue
        case AttributeValueDoubleQuoted
        case AttributeValueSingleQuoted
        case AttributeValueUnquoted
        case AfterAttributeValueQuoted
        case SelfClosingStartTag
        case BogusComment
        case MarkupDeclarationOpen
        case CommentStart
        case CommentStartDash
        case Comment
        case CommentLessThanSign
        case CommentLessThanSignBang
        case CommentLessThanSignBangDash
        case CommentLessThanSignBangDashDash
        case CommentEndDash
        case CommentEnd
        case CommentEndBang
        case DOCTYPE
        case BeforeDOCTYPEName
        case DOCTYPEName
        case AfterDOCTYPEName
        case AfterDOCTYPEPublicKeyword
        case BeforeDOCTYPEPublicIdentifier
        case DOCTYPEPublicIdentifierDoubleQuoted
        case DOCTYPEPublicIdentifierSingleQuoted
        case AfterDOCTYPEPublicIdentifier
        case BetweenDOCTYPEPublicAndSystemIdentifiers
        case AfterDOCTYPESystemKeyword
        case BeforeDOCTYPESystemIdentifier
        case DOCTYPESystemIdentifierDoubleQuoted
        case DOCTYPESystemIdentifierSingleQuoted
        case AfterDOCTYPESystemIdentifier
        case BogusDOCTYPE
        case CDATASection
        case CDATASectionBracket
        case CDATASectionEnd
        case CharacterReference
        case NamedCharacterReference
        case AmbiguousAmpersand
        case NumericCharacterReference
        case HexadecimalCharacterReferenceStart
        case DecimalCharacterReferenceStart
        case HexadecimalCharacterReference
        case DecimalCharacterReference
        case NumericCharacterReferenceEnd
    }

    /// Decoded input text; empty when created through `init()`.
    var input = Swift.String()
    /// Current tokenizer state; starts in `.Data`.
    var state = State.Data
    /// Saved state; starts in `.Data`. Nothing in this class reads or writes it yet.
    var returnState = State.Data

    /// Token under construction. Not yet touched by any method of this class.
    var currentToken = HTMLToken()
    /// Tokens waiting to be handed out by `nextToken`, in FIFO order.
    var queuedTokens = Deque<HTMLToken>()

    /// Creates a tokenizer with empty input.
    public init() {}

    /// Creates a tokenizer over `input`, decoded from the text encoding named by `encoding`.
    ///
    /// Fails (produces `nil`) when `Swift.String(decoding:as:)` cannot decode the input.
    public init?(input: AK.StringView, encoding: AK.StringView) {
        guard let decoded = Swift.String(decoding: input, as: encoding) else {
            return nil
        }
        self.input = decoded
    }

    /// Returns the oldest queued token, or `nil` when the queue is empty.
    ///
    /// - Parameter stopAtInsertionPoint: Currently unused; kept so the signature
    ///   is stable for callers.
    public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
        // A single conditional pop: the previous `while` returned on its first
        // iteration and therefore never actually looped.
        if let token = queuedTokens.popFirst() {
            return token
        }

        return nil
    }

}
|
|
@ -0,0 +1,23 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2024, Andrew Kaster <akaster@serenityos.org>
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
|
*/
|
||||||
|
|
||||||
|
#include <LibTextCodec/Decoder.h>
|
||||||
|
#include <LibWeb/HTML/Parser/HTMLTokenizerHelpers.h>
|
||||||
|
|
||||||
|
namespace Web::HTML {
|
||||||
|
|
||||||
|
OptionalString decode_to_utf8(StringView text, StringView encoding)
|
||||||
|
{
|
||||||
|
auto decoder = TextCodec::decoder_for(encoding);
|
||||||
|
if (!decoder.has_value())
|
||||||
|
return std::nullopt;
|
||||||
|
auto decoded_or_error = decoder.value().to_utf8(text);
|
||||||
|
if (decoded_or_error.is_error())
|
||||||
|
return std::nullopt;
|
||||||
|
return decoded_or_error.release_value();
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
19
Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.h
Normal file
19
Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.h
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2024, Andrew Kaster <akaster@serenityos.org>
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: BSD-2-Clause
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <AK/String.h>
|
||||||
|
#include <AK/StringView.h>
|
||||||
|
#include <optional>
|
||||||
|
|
||||||
|
namespace Web::HTML {
|
||||||
|
|
||||||
|
// Swift-friendly wrapper for TextCodec::Decoder::to_utf8
|
||||||
|
using OptionalString = std::optional<String>;
|
||||||
|
OptionalString decode_to_utf8(StringView text, StringView encoding);
|
||||||
|
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue