diff --git a/Userland/Libraries/LibWeb/CMakeLists.txt b/Userland/Libraries/LibWeb/CMakeLists.txt
index bf4a4d7b147..86801ec5641 100644
--- a/Userland/Libraries/LibWeb/CMakeLists.txt
+++ b/Userland/Libraries/LibWeb/CMakeLists.txt
@@ -789,6 +789,8 @@ target_link_libraries(LibWeb PRIVATE LibCore LibCrypto LibJS LibHTTP LibGfx LibI
 generate_js_bindings(LibWeb)
 
 if (ENABLE_SWIFT)
+    include(collections)
+
     set(generated_headers ${GENERATED_SOURCES})
     list(FILTER generated_headers INCLUDE REGEX "\\.h$")
     list(TRANSFORM generated_headers PREPEND "${CMAKE_CURRENT_BINARY_DIR}/")
@@ -800,10 +802,14 @@ if (ENABLE_SWIFT)
 
     target_sources(LibWeb PRIVATE
         HTML/Parser/HTMLToken.swift
+        HTML/Parser/HTMLTokenizer.swift
+        HTML/Parser/HTMLTokenizerHelpers.cpp
     )
     target_compile_definitions(LibWeb PRIVATE LIBWEB_USE_SWIFT)
     set_target_properties(LibWeb PROPERTIES Swift_MODULE_NAME "SwiftLibWeb")
 
+    target_link_libraries(LibWeb PRIVATE AK Collections)
+
     # FIXME: These should be pulled automatically from interface compile options for the target
     set(VFS_OVERLAY_OPTIONS
         -Xcc -ivfsoverlay${CMAKE_CURRENT_BINARY_DIR}/vfs_overlay.yaml
@@ -811,6 +817,8 @@ if (ENABLE_SWIFT)
         -Xcc -ivfsoverlay${Lagom_BINARY_DIR}/AK/vfs_overlay.yaml
     )
     get_target_property(LIBWEB_NATIVE_DIRS LibWeb INCLUDE_DIRECTORIES)
+    list(APPEND LIBWEB_NATIVE_DIRS ${CMAKE_Swift_MODULE_DIRECTORY})
+
     _swift_generate_cxx_header(LibWeb "LibWeb-Swift.h"
         SEARCH_PATHS ${LIBWEB_NATIVE_DIRS}
         COMPILE_OPTIONS ${VFS_OVERLAY_OPTIONS}
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
new file mode 100644
index 00000000000..c64048947ca
--- /dev/null
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2024, Andrew Kaster >
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+import Collections
+import Foundation
+import LibWeb
+import SwiftAK
+
+extension Swift.String {
+    /// Builds a Swift string by decoding `decoding` with the text encoding named by `as`.
+    /// Fails (yields nil) when `Web.HTML.decode_to_utf8` returns no value.
+    public init?(decoding: AK.StringView, as: AK.StringView) {
+        let maybe_decoded = Web.HTML.decode_to_utf8(decoding, `as`)
+        if maybe_decoded.hasValue {
+            self.init(maybe_decoded.value!)
+        } else {
+            return nil
+        }
+    }
+}
+
+/// Skeleton of the Swift HTML tokenizer: it stores the decoded input, the current and
+/// return states, the current token and a queue of tokens waiting to be handed out.
+class HTMLTokenizer {
+
+    /// Tokenizer states. NOTE(review): the names appear to follow the tokenization
+    /// states of the HTML Standard -- confirm against the spec when wiring up transitions.
+    enum State {
+        case Data
+        case RCDATA
+        case RAWTEXT
+        case ScriptData
+        case PLAINTEXT
+        case TagOpen
+        case EndTagOpen
+        case TagName
+        case RCDATALessThanSign
+        case RCDATAEndTagOpen
+        case RCDATAEndTagName
+        case RAWTEXTLessThanSign
+        case RAWTEXTEndTagOpen
+        case RAWTEXTEndTagName
+        case ScriptDataLessThanSign
+        case ScriptDataEndTagOpen
+        case ScriptDataEndTagName
+        case ScriptDataEscapeStart
+        case ScriptDataEscapeStartDash
+        case ScriptDataEscaped
+        case ScriptDataEscapedDash
+        case ScriptDataEscapedDashDash
+        case ScriptDataEscapedLessThanSign
+        case ScriptDataEscapedEndTagOpen
+        case ScriptDataEscapedEndTagName
+        case ScriptDataDoubleEscapeStart
+        case ScriptDataDoubleEscaped
+        case ScriptDataDoubleEscapedDash
+        case ScriptDataDoubleEscapedDashDash
+        case ScriptDataDoubleEscapedLessThanSign
+        case ScriptDataDoubleEscapeEnd
+        case BeforeAttributeName
+        case AttributeName
+        case AfterAttributeName
+        case BeforeAttributeValue
+        case AttributeValueDoubleQuoted
+        case AttributeValueSingleQuoted
+        case AttributeValueUnquoted
+        case AfterAttributeValueQuoted
+        case SelfClosingStartTag
+        case BogusComment
+        case MarkupDeclarationOpen
+        case CommentStart
+        case CommentStartDash
+        case Comment
+        case CommentLessThanSign
+        case CommentLessThanSignBang
+        case CommentLessThanSignBangDash
+        case CommentLessThanSignBangDashDash
+        case CommentEndDash
+        case CommentEnd
+        case CommentEndBang
+        case DOCTYPE
+        case BeforeDOCTYPEName
+        case DOCTYPEName
+        case AfterDOCTYPEName
+        case AfterDOCTYPEPublicKeyword
+        case BeforeDOCTYPEPublicIdentifier
+        case DOCTYPEPublicIdentifierDoubleQuoted
+        case DOCTYPEPublicIdentifierSingleQuoted
+        case AfterDOCTYPEPublicIdentifier
+        case BetweenDOCTYPEPublicAndSystemIdentifiers
+        case AfterDOCTYPESystemKeyword
+        case BeforeDOCTYPESystemIdentifier
+        case DOCTYPESystemIdentifierDoubleQuoted
+        case DOCTYPESystemIdentifierSingleQuoted
+        case AfterDOCTYPESystemIdentifier
+        case BogusDOCTYPE
+        case CDATASection
+        case CDATASectionBracket
+        case CDATASectionEnd
+        case CharacterReference
+        case NamedCharacterReference
+        case AmbiguousAmpersand
+        case NumericCharacterReference
+        case HexadecimalCharacterReferenceStart
+        case DecimalCharacterReferenceStart
+        case HexadecimalCharacterReference
+        case DecimalCharacterReference
+        case NumericCharacterReferenceEnd
+    }
+
+    var input = Swift.String()
+    var state = State.Data
+    var returnState = State.Data
+
+    var currentToken = HTMLToken()
+    var queuedTokens = Deque<HTMLToken>()
+
+    public init() {}
+    /// Creates a tokenizer over `input` decoded from `encoding`; fails when decoding fails.
+    public init?(input: AK.StringView, encoding: AK.StringView) {
+        if let string = Swift.String(decoding: input, as: encoding) {
+            self.input = string
+        } else {
+            return nil
+        }
+    }
+
+    /// Removes and returns the token at the front of the queue, or nil when the queue is empty.
+    /// `stopAtInsertionPoint` is accepted but not consulted yet.
+    public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
+
+        if let token = queuedTokens.popFirst() {
+            return token
+        }
+
+        return nil
+    }
+
+}
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.cpp b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.cpp
new file mode 100644
index 00000000000..888e54f28f2
--- /dev/null
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.cpp
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2024, Andrew Kaster
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#include <LibTextCodec/Decoder.h>
+#include <LibWeb/HTML/Parser/HTMLTokenizerHelpers.h>
+
+namespace Web::HTML {
+
+OptionalString decode_to_utf8(StringView text, StringView encoding)
+{
+    auto decoder = TextCodec::decoder_for(encoding);
+    if (!decoder.has_value())
+        return std::nullopt;
+    auto decoded_or_error = decoder.value().to_utf8(text);
+    if (decoded_or_error.is_error())
+        return std::nullopt;
+    return decoded_or_error.release_value();
+}
+
+}
diff --git a/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.h b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.h
new file mode 100644
index 00000000000..3902cb29ea7
--- /dev/null
+++ b/Userland/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.h
@@ -0,0 +1,20 @@
+/*
+ * Copyright (c) 2024, Andrew Kaster
+ *
+ * SPDX-License-Identifier: BSD-2-Clause
+ */
+
+#pragma once
+
+#include <AK/String.h>
+#include <AK/StringView.h>
+#include <optional>
+
+namespace Web::HTML {
+
+// Swift-friendly wrapper for TextCodec::Decoder::to_utf8
+// Returns std::nullopt when no decoder exists for `encoding` or the conversion fails.
+using OptionalString = std::optional<String>;
+OptionalString decode_to_utf8(StringView text, StringView encoding);
+
+}