mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-06-03 00:42:54 +00:00
LibWeb: Implement the Data state for the Swift tokenizer
And add tests! This implementation closely follows the current C++ implementation, replacing macros and gotos with a slightly more complex state machine. It's very possible that an async version that yields tokens on "emit" would be even simpler, but let's get this one working first :).
This commit is contained in:
parent
01c4625a42
commit
77718c0a66
Notes:
github-actions[bot]
2024-08-29 04:32:14 +00:00
Author: https://github.com/ADKaster
Commit: 77718c0a66
Pull-request: https://github.com/LadybirdBrowser/ladybird/pull/1220
3 changed files with 296 additions and 21 deletions
|
@ -20,9 +20,9 @@ extension Swift.String {
|
|||
}
|
||||
}
|
||||
|
||||
class HTMLTokenizer {
|
||||
public class HTMLTokenizer {
|
||||
|
||||
enum State {
|
||||
public enum State {
|
||||
case Data
|
||||
case RCDATA
|
||||
case RAWTEXT
|
||||
|
@ -105,29 +105,224 @@ class HTMLTokenizer {
|
|||
case NumericCharacterReferenceEnd
|
||||
}
|
||||
|
||||
// MARK: - Stored state

/// The text being tokenized.
private var input = Swift.String()
/// Index of the next character to be consumed from `input`.
private var cursor: Swift.String.Index
/// Index that `restoreCursorToPrevious()` rewinds `cursor` to.
private var previousCursor: Swift.String.Index

/// Current tokenizer state; readable by clients, writable only by the tokenizer.
public private(set) var state = State.Data
/// State to go back to from `switchToReturnState()` / `reconsumeInReturnState(_:)`.
private var returnState = State.Data

/// Token currently under construction.
private var currentToken = HTMLToken()
/// Tokens that are ready to be handed out by `nextToken`.
private var queuedTokens = Deque<HTMLToken>()

/// Set by `abort()`.
private var aborted = false
/// Set once the end-of-file token has been produced, so it is emitted only once.
private var hasEmittedEOF = false
|
||||
|
||||
/// Creates a tokenizer over the default (empty) input.
///
/// Both `cursor` and `previousCursor` start at the beginning of the input.
public init() {
    let start = self.input.startIndex
    self.cursor = start
    self.previousCursor = start
}
|
||||
/// Creates a tokenizer over `input`, converted to a Swift string via
/// `Swift.String(decoding:as:)` with the given `encoding`.
///
/// - Parameters:
///   - input: The source text to tokenize.
///   - encoding: Passed through to `Swift.String(decoding:as:)`;
///     presumably an encoding label such as "UTF-8".
/// - Returns: `nil` when the string conversion yields `nil`.
public init?(input: AK.StringView, encoding: AK.StringView) {
    guard let decoded = Swift.String(decoding: input, as: encoding) else {
        return nil
    }
    self.input = decoded

    // Start both cursors at the beginning of the freshly assigned input.
    self.cursor = self.input.startIndex
    self.previousCursor = self.input.startIndex
}
|
||||
|
||||
/// Creates a tokenizer over `input`, delegating to `init(input:encoding:)`
/// with the encoding fixed to "UTF-8".
///
/// - Parameter input: The source text to tokenize.
/// - Returns: `nil` when the designated initializer fails.
public convenience init?(input: AK.StringView) {
    self.init(input: input, encoding: "UTF-8")
}
|
||||
|
||||
/// Flags the tokenizer as aborted.
///
/// `nextToken` consults this flag whenever its token queue is empty.
public func abort() {
    aborted = true
}
|
||||
|
||||
/// Advances the cursor by `count` characters, clamping at the end of the input,
/// and records the position just before the new cursor as `previousCursor`.
///
/// - Parameter count: Number of characters to advance; expected to be non-negative.
func skip(_ count: Int) {
    cursor = input.index(cursor, offsetBy: count, limitedBy: input.endIndex) ?? input.endIndex

    // `index(before:)` traps when handed `startIndex`, which happens for an
    // empty input or for `skip(0)` at the very beginning. There is nothing
    // before the cursor in that case, so keep `previousCursor` at the start.
    if cursor > input.startIndex {
        previousCursor = input.index(before: cursor)
    } else {
        previousCursor = input.startIndex
    }
}
|
||||
|
||||
/// Returns the character `offset` positions away from the cursor without
/// consuming anything.
///
/// - Parameter offset: Distance from the cursor; `0` is the next unconsumed character.
/// - Returns: The character at that position, or `nil` when the position lies
///   outside the input (including when the input is empty or fully consumed).
func peekCodePoint(_ offset: Int = 0) -> Character? {
    // The limit only constrains movement in the direction of travel, so pick
    // the bound that matches the sign of `offset`.
    let limit = offset < 0 ? input.startIndex : input.endIndex

    // `endIndex` is a valid result of the offset computation but not a valid
    // subscript, so it has to be rejected explicitly. This also avoids calling
    // `index(before: endIndex)`, which traps on an empty string.
    guard let index = input.index(cursor, offsetBy: offset, limitedBy: limit),
          index < input.endIndex
    else {
        return nil
    }
    return input[index]
}
|
||||
|
||||
/// Consumes and returns the next character of the input with newlines normalized.
///
/// - Returns: The consumed character, with CR LF and lone CR replaced by LF,
///   or `nil` once the input is exhausted.
func nextCodePoint() -> Character? {
    guard cursor < input.endIndex else {
        return nil
    }

    let character = input[cursor]
    skip(1)

    // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
    // https://infra.spec.whatwg.org/#normalize-newlines
    //
    // `String` is iterated by `Character` (extended grapheme cluster), and
    // CR LF always forms a single cluster. A CR LF pair therefore arrives here
    // as the one character "\r\n"; a "\r" can never be followed by a separate
    // "\n" character, so peeking for the pair one element at a time never matches.
    switch character {
    case "\r\n":
        // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
        return "\n"
    case "\r":
        // replace every remaining U+000D CR code point with a U+000A LF code point.
        return "\n"
    default:
        return character
    }
}
|
||||
|
||||
/// Rewinds `cursor` to the position stored in `previousCursor`.
func restoreCursorToPrevious() {
    cursor = previousCursor
}
|
||||
|
||||
/// Makes `token` the token currently under construction.
///
/// - Parameter token: The token that replaces `currentToken`.
func createNewToken(_ token: HTMLToken) {
    // FIXME: Assign Position
    currentToken = token
}
|
||||
|
||||
/// Result of a single `nextTokenImpl` step; tells the loop in `nextToken`
/// what to do next.
enum NextTokenState {
    /// Return `token` to the caller of `nextToken`.
    case Emit(token: HTMLToken?)

    /// The state changed; run another step with a freshly read character.
    case SwitchTo

    /// Run another step, feeding it `inputCharacter` again.
    case Reconsume(inputCharacter: Character?)

    /// Check the token queue before running another step.
    case ReprocessQueue
}
|
||||
|
||||
/// Produces the next token from the input.
///
/// Queued tokens are handed out first. Otherwise the state machine is stepped
/// until a step asks for a token to be emitted.
///
/// - Parameter stopAtInsertionPoint: Currently unused (see FIXME below).
/// - Returns: The next token, or `nil` when the tokenizer was aborted and the
///   queue is empty, or when a step emits `nil` (as `emitEOF` does after the
///   end-of-file token has already been produced).
public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {
    // Two levels of optionality: the outer `nil` means "nothing decided, keep
    // tokenizing"; a non-nil outer value is what `nextToken` must return,
    // which itself may be `nil` once the tokenizer has been aborted.
    func processQueue() -> HTMLToken?? {
        if let token = queuedTokens.popFirst() {
            return token
        }
        return aborted ? Optional(nil) : nil
    }

    if let maybeToken = processQueue() {
        return maybeToken
    }

    var nextInputCharacter: Character? = nil
    while true {
        // FIXME: Handle insertion point
        switch nextTokenImpl(nextInputCharacter) {
        case .Emit(let token):
            return token
        case .SwitchTo:
            nextInputCharacter = nil
        case .Reconsume(let character):
            nextInputCharacter = character
        case .ReprocessQueue:
            if let maybeToken = processQueue() {
                return maybeToken
            }
            nextInputCharacter = nil
        }
    }
}
|
||||
|
||||
/// Moves the tokenizer into `state`.
///
/// - Parameter state: The state to enter.
/// - Returns: `.SwitchTo`.
func switchTo(_ state: State) -> NextTokenState {
    self.state = state
    return NextTokenState.SwitchTo
}
|
||||
|
||||
/// Moves the tokenizer into `state` and asks for `character` to be processed
/// again there.
///
/// - Parameters:
///   - character: The character to hand to the next step.
///   - state: The state to enter.
/// - Returns: `.Reconsume` carrying `character`.
func reconsume(_ character: Character, `in` state: State) -> NextTokenState {
    self.state = state
    return NextTokenState.Reconsume(inputCharacter: character)
}
|
||||
|
||||
/// Moves the tokenizer back into the saved return state.
///
/// - Returns: `.ReprocessQueue`.
func switchToReturnState() -> NextTokenState {
    state = returnState
    return NextTokenState.ReprocessQueue
}
|
||||
|
||||
/// Moves the tokenizer back into the saved return state, rewinding the cursor
/// when there is a character to read again.
///
/// - Parameter character: The current input character; `nil` leaves the cursor untouched.
/// - Returns: `.ReprocessQueue`.
func reconsumeInReturnState(_ character: Character?) -> NextTokenState {
    state = returnState

    guard character != nil else {
        return .ReprocessQueue
    }
    restoreCursorToPrevious()
    return .ReprocessQueue
}
|
||||
|
||||
/// Moves the tokenizer into `state`, queues the token under construction and
/// emits the token at the front of the queue.
///
/// - Parameter state: The state to enter.
/// - Returns: `.Emit` carrying the first queued token.
func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState {
    self.state = state

    queuedTokens.append(currentToken)
    currentToken = HTMLToken()

    // The queue holds at least the token appended above.
    let front = queuedTokens.popFirst()!
    return .Emit(token: front)
}
|
||||
|
||||
/// Moves the tokenizer into `state` and emits a character token for `character`.
///
/// - Parameters:
///   - state: The state to enter.
///   - character: The character to emit.
/// - Returns: Whatever `emitCharacter(_:)` returns.
func switchToAndEmitCharacter(_ state: State, character: Character) -> NextTokenState {
    self.state = state
    let result = emitCharacter(character)
    return result
}
|
||||
|
||||
/// Queues a character token for `character`, enters the given state and asks
/// for `currentInputCharacter` to be processed again there.
///
/// - Parameters:
///   - character: The character to wrap in the queued token.
///   - in: The state to enter.
///   - currentInputCharacter: The character to hand to the next step.
/// - Returns: `.Reconsume` carrying `currentInputCharacter`.
func emitCharacterAndReconsume(_ character: Character, `in`: State, currentInputCharacter: Character?) -> NextTokenState {
    let characterToken = HTMLToken(type: .Character(codePoint: character))
    queuedTokens.append(characterToken)

    state = `in`
    return .Reconsume(inputCharacter: currentInputCharacter)
}
|
||||
|
||||
/// Emits the end-of-file token the first time it is called.
///
/// - Returns: `.Emit` carrying the first queued token on the first call, and
///   `.Emit(token: nil)` on every later call.
func emitEOF() -> NextTokenState {
    guard !hasEmittedEOF else {
        return .Emit(token: nil)
    }
    hasEmittedEOF = true

    createNewToken(HTMLToken(type: .EndOfFile))
    queuedTokens.append(currentToken)
    currentToken = HTMLToken()

    // The queue holds at least the token appended above.
    let front = queuedTokens.popFirst()!
    return .Emit(token: front)
}
|
||||
|
||||
/// Queues the token under construction and then emits end-of-file.
///
/// Must not be called after the end-of-file token has been produced; this is
/// enforced with a precondition.
///
/// - Returns: Whatever `emitEOF()` returns.
func emitCurrentTokenFollowedByEOF() -> NextTokenState {
    precondition(!hasEmittedEOF)

    queuedTokens.append(currentToken)
    currentToken = HTMLToken()

    let result = emitEOF()
    return result
}
|
||||
|
||||
/// Builds a character token for `character`, queues it and emits the token at
/// the front of the queue.
///
/// - Parameter character: The character to wrap in the token.
/// - Returns: `.Emit` carrying the first queued token.
func emitCharacter(_ character: Character) -> NextTokenState {
    let characterToken = HTMLToken(type: .Character(codePoint: character))
    createNewToken(characterToken)

    queuedTokens.append(currentToken)
    currentToken = HTMLToken()

    // The queue holds at least the token appended above.
    let front = queuedTokens.popFirst()!
    return .Emit(token: front)
}
|
||||
|
||||
/// Runs one step of the tokenizer state machine.
///
/// - Parameter nextInputCharacter: A character to process again; when `nil`,
///   the next character is read from the input instead.
/// - Returns: What the loop in `nextToken` should do next.
func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
    // Not called by any state handled here; referenced once so the compiler
    // does not flag it as unused.
    let dontConsumeNextInputCharacter = { self.restoreCursorToPrevious() }
    _ = dontConsumeNextInputCharacter

    // FIXME: flushCodepointsConsumedAsACharacterReference needs currentBuilder

    // Handle reconsume by passing the character around in the state enum
    let currentInputCharacter = nextInputCharacter ?? nextCodePoint()

    switch state {
    // 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
    case .Data:
        switch currentInputCharacter {
        case "&":
            returnState = .Data
            return switchTo(.CharacterReference)
        case "<":
            return switchTo(.TagOpen)
        case "\0":
            // FIXME: log_parse_error()
            return emitCharacter("\u{FFFD}")
        case .none:
            return emitEOF()
        case .some(let character):
            return emitCharacter(character)
        }

    default:
        print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))")
        return emitEOF()
    }
}
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue