mirror of
https://github.com/LadybirdBrowser/ladybird.git
synced 2025-04-26 14:28:49 +00:00
There are two changes happening here: a correctness fix, and an optimization. In theory they are unrelated, but the optimization actually paves the way for the correctness fix. Before this commit, the HTML tokenizer would attempt to look for named character references by checking from after the `&` until the end of m_decoded_input, which meant that it was unable to recognize things like named character references that are inserted via `document.write` one byte at a time. For example, if `&notin;` was written one-byte-at-a-time with `document.write`, then the tokenizer would only check against `n` since that's all that would exist at the time of the check and therefore erroneously conclude that it was an invalid named character reference. This commit modifies the approach taken for named character reference matching by using a trie-like structure (specifically, a deterministic acyclic finite state automaton or DAFSA), which allows for efficiently matching one-character-at-a-time and therefore it is able to pick up matching where it left off after each code point is consumed. Note: Because it's possible for a partial match to not actually develop into a full match (e.g. `&notindo` which could lead to `&notindot;`), some backtracking is performed after-the-fact in order to only consume the code points within the longest match found (e.g. `&notindo` would backtrack back to `&not`). With this new approach, `document.write` being called one-byte-at-a-time is handled correctly, which allows for passing more WPT tests, with the most directly relevant tests being `/html/syntax/parsing/html5lib_entities01.html` and `/html/syntax/parsing/html5lib_entities02.html` when run with `?run_type=write_single`. Additionally, the implementation now better conforms to the language of the spec (and resolves a FIXME) because exactly the matched characters are consumed and nothing more, so SWITCH_TO is able to be used as the spec says instead of RECONSUME_IN. 
The new approach is also an optimization: - Instead of a linear search using `starts_with`, the usage of a DAFSA means that it is always aware of which characters can lead to a match at any given point, and will bail out whenever a match is no longer possible. - The DAFSA is able to take advantage of the note in the section `13.5 Named character references` that says "This list is static and will not be expanded or changed in the future." and tailor its Node struct accordingly to tightly pack each node's data into 32-bits. Together with the inherent DAFSA property of redundant node deduplication, the amount of data stored for named character reference matching is minimized. In my testing: - A benchmark tokenizing an arbitrary set of HTML test files was about 1.23x faster (2070ms to 1682ms). - A benchmark tokenizing a file with tens of thousands of named character references mixed in with truncated named character references and arbitrary ASCII characters/ampersands runs about 8x faster (758ms to 93ms). - The size of `liblagom-web.so` was reduced by 94.96KiB. Some technical details: A DAFSA (deterministic acyclic finite state automaton) is essentially a trie flattened into an array, but it also uses techniques to minimize redundant nodes. This provides fast lookups while minimizing the required data size, but normally does not allow for associating data related to each word. However, by adding a count of the number of possible words from each node, it becomes possible to also use it to achieve minimal perfect hashing for the set of words (which allows going from word -> unique index as well as unique index -> word). This allows us to store a second array of data so that the DAFSA can be used as a lookup for e.g. the associated code points. For the Swift implementation, the new NamedCharacterReferenceMatcher was used to satisfy the previous API and the tokenizer was left alone otherwise. 
In the future, the Swift implementation should be updated to use the same implementation for its NamedCharacterReference state as the updated C++ implementation.
1928 lines
91 KiB
Swift
1928 lines
91 KiB
Swift
/*
|
||
* Copyright (c) 2024, Andrew Kaster <andrew@ladybird.org>
|
||
*
|
||
* SPDX-License-Identifier: BSD-2-Clause
|
||
*/
|
||
|
||
import AK
|
||
import Collections
|
||
import Foundation
|
||
@_exported import WebCxx
|
||
|
||
extension Swift.String {
    /// Creates a string by decoding `decoding` from the encoding named by `as`.
    ///
    /// Returns `nil` when `Web.HTML.decode_to_utf8` yields no value.
    public init?(decoding: AK.StringView, as: AK.StringView) {
        let decodeResult = Web.HTML.decode_to_utf8(decoding, `as`)
        guard decodeResult.hasValue else {
            return nil
        }
        self.init(akString: decodeResult.value!)
    }

    /// Returns the current contents and resets the receiver to the empty string.
    public mutating func takeString() -> Swift.String {
        // The return value is captured before the deferred reset runs.
        defer { self = "" }
        return self
    }
}
|
||
|
||
public class HTMLTokenizer {
|
||
|
||
/// Tokenizer states, declared in the same order as the original one-case-per-line list.
public enum State {
    // Content states
    case Data, RCDATA, RAWTEXT, ScriptData, PLAINTEXT

    // Tag states
    case TagOpen, EndTagOpen, TagName

    // RCDATA end tag states
    case RCDATALessThanSign, RCDATAEndTagOpen, RCDATAEndTagName

    // RAWTEXT end tag states
    case RAWTEXTLessThanSign, RAWTEXTEndTagOpen, RAWTEXTEndTagName

    // Script data states
    case ScriptDataLessThanSign, ScriptDataEndTagOpen, ScriptDataEndTagName
    case ScriptDataEscapeStart, ScriptDataEscapeStartDash
    case ScriptDataEscaped, ScriptDataEscapedDash, ScriptDataEscapedDashDash
    case ScriptDataEscapedLessThanSign, ScriptDataEscapedEndTagOpen, ScriptDataEscapedEndTagName
    case ScriptDataDoubleEscapeStart
    case ScriptDataDoubleEscaped, ScriptDataDoubleEscapedDash, ScriptDataDoubleEscapedDashDash
    case ScriptDataDoubleEscapedLessThanSign, ScriptDataDoubleEscapeEnd

    // Attribute states
    case BeforeAttributeName, AttributeName, AfterAttributeName
    case BeforeAttributeValue
    case AttributeValueDoubleQuoted, AttributeValueSingleQuoted, AttributeValueUnquoted
    case AfterAttributeValueQuoted

    case SelfClosingStartTag, BogusComment, MarkupDeclarationOpen

    // Comment states
    case CommentStart, CommentStartDash, Comment
    case CommentLessThanSign, CommentLessThanSignBang
    case CommentLessThanSignBangDash, CommentLessThanSignBangDashDash
    case CommentEndDash, CommentEnd, CommentEndBang

    // DOCTYPE states
    case DOCTYPE, BeforeDOCTYPEName, DOCTYPEName, AfterDOCTYPEName
    case AfterDOCTYPEPublicKeyword, BeforeDOCTYPEPublicIdentifier
    case DOCTYPEPublicIdentifierDoubleQuoted, DOCTYPEPublicIdentifierSingleQuoted
    case AfterDOCTYPEPublicIdentifier, BetweenDOCTYPEPublicAndSystemIdentifiers
    case AfterDOCTYPESystemKeyword, BeforeDOCTYPESystemIdentifier
    case DOCTYPESystemIdentifierDoubleQuoted, DOCTYPESystemIdentifierSingleQuoted
    case AfterDOCTYPESystemIdentifier, BogusDOCTYPE

    // CDATA section states
    case CDATASection, CDATASectionBracket, CDATASectionEnd

    // Character reference states
    case CharacterReference, NamedCharacterReference, AmbiguousAmpersand
    case NumericCharacterReference
    case HexadecimalCharacterReferenceStart, DecimalCharacterReferenceStart
    case HexadecimalCharacterReference, DecimalCharacterReference
    case NumericCharacterReferenceEnd
}
|
||
|
||
// Decoded input text, the position of the next character to consume, and the
// position the cursor had before the most recent `skip(_:)`.
private var input: Swift.String = ""
private var cursor: Swift.String.Index
private var previousCursor: Swift.String.Index

// Current state, and the state to resume once a character reference is done.
public private(set) var state: State = .Data
private var returnState: State = .Data

// Token under construction and tokens waiting to be handed out by `nextToken`.
private var currentToken = HTMLToken()
private var queuedTokens: Deque<HTMLToken> = []

// Scratch buffers and per-token bookkeeping.
private var currentBuilder: Swift.String = ""
private var temporaryBuffer: Swift.String = ""
private var lastStartTagName: Swift.String?
private var currentTokensAttributes: [HTMLToken.Attribute]?
private var currentAttribute: HTMLToken.Attribute?
private var characterReferenceCode = 0

private var aborted: Bool = false
private var hasEmittedEOF: Bool = false
|
||
|
||
// https://infra.spec.whatwg.org/#noncharacter
// A noncharacter is a code point that is in the range U+FDD0 to U+FDEF, inclusive,
// or the last two code points (U+xFFFE, U+xFFFF) of each of the 17 planes,
// i.e. U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, ... U+10FFFE, U+10FFFF.
static private var nonCharacter: CharacterSet = {
    var set = CharacterSet(charactersIn: Unicode.Scalar(0xFDD0 as UInt32)!...Unicode.Scalar(0xFDEF as UInt32)!)
    for plane in UInt32(0x00)...UInt32(0x10) {
        let low = Unicode.Scalar((plane << 16) | 0xFFFE)!
        let high = Unicode.Scalar((plane << 16) | 0xFFFF)!
        set.formUnion(CharacterSet(charactersIn: low...high))
    }
    return set
}()
|
||
|
||
// https://infra.spec.whatwg.org/#ascii-whitespace
static private var asciiWhitespace = CharacterSet(charactersIn: "\t\n\u{000C}\u{000D} ")

// https://infra.spec.whatwg.org/#c0-control
static private var c0Control = CharacterSet(charactersIn: "\u{0000}"..."\u{001F}")

// https://infra.spec.whatwg.org/#control
static private var control = c0Control.union(CharacterSet(charactersIn: "\u{007F}"..."\u{009F}"))

// IMPLEMENTATION DEFINED: Used for the numeric character reference end state
static private var controlNotAsciiWhitespace = control.subtracting(asciiWhitespace)

// https://infra.spec.whatwg.org/#ascii-digit
static private var asciiDigit = CharacterSet(charactersIn: "0123456789")

// https://infra.spec.whatwg.org/#ascii-upper-hex-digit
// NOTE: Letters A-F only. Infra's definition also includes the ASCII digits; they are
//       deliberately left out here so existing users that treat this set as "letters
//       only" keep working (presumably digits are matched via asciiDigit first).
static private var asciiUpperHexDigit = CharacterSet(charactersIn: "ABCDEF")

// https://infra.spec.whatwg.org/#ascii-lower-hex-digit
// NOTE: Letters a-f only, see the note on asciiUpperHexDigit.
static private var asciiLowerHexDigit = CharacterSet(charactersIn: "abcdef")

// https://infra.spec.whatwg.org/#ascii-hex-digit
// An ASCII hex digit is 0-9, A-F or a-f. The digits must be included explicitly
// because the upper/lower sets above only hold the letters.
static private var asciiHexDigit = asciiDigit.union(asciiUpperHexDigit).union(asciiLowerHexDigit)

// https://infra.spec.whatwg.org/#ascii-upper-alpha
static private var asciiUpperAlpha = CharacterSet(charactersIn: "ABCDEFGHIJKLMNOPQRSTUVWXYZ")

// https://infra.spec.whatwg.org/#ascii-lower-alpha
static private var asciiLowerAlpha = CharacterSet(charactersIn: "abcdefghijklmnopqrstuvwxyz")

// https://infra.spec.whatwg.org/#ascii-alpha
static private var asciiAlpha = asciiUpperAlpha.union(asciiLowerAlpha)

// https://infra.spec.whatwg.org/#ascii-alphanumeric
static private var asciiAlphanumeric = asciiAlpha.union(asciiDigit)
|
||
|
||
// Replacement scalars for numeric character references in the 0x80...0x9F range.
// Keys without an entry (0x81, 0x8D, 0x8F, 0x90, 0x9D) have no replacement.
static private var characterReferenceControlCodeMapping: [Int: Unicode.Scalar] = [
    0x80: "\u{20AC}", // €
    0x82: "\u{201A}", // ‚
    0x83: "\u{0192}", // ƒ
    0x84: "\u{201E}", // „
    0x85: "\u{2026}", // …
    0x86: "\u{2020}", // †
    0x87: "\u{2021}", // ‡
    0x88: "\u{02C6}", // ˆ
    0x89: "\u{2030}", // ‰
    0x8A: "\u{0160}", // Š
    0x8B: "\u{2039}", // ‹
    0x8C: "\u{0152}", // Œ
    0x8E: "\u{017D}", // Ž
    0x91: "\u{2018}", // ‘
    0x92: "\u{2019}", // ’
    0x93: "\u{201C}", // “
    0x94: "\u{201D}", // ”
    0x95: "\u{2022}", // •
    0x96: "\u{2013}", // –
    0x97: "\u{2014}", // —
    0x98: "\u{02DC}", // ˜
    0x99: "\u{2122}", // ™
    0x9A: "\u{0161}", // š
    0x9B: "\u{203A}", // ›
    0x9C: "\u{0153}", // œ
    0x9E: "\u{017E}", // ž
    0x9F: "\u{0178}", // Ÿ
]
|
||
|
||
/// Creates a tokenizer over empty input, with both cursors at the start.
public init() {
    let start = self.input.startIndex
    self.cursor = start
    self.previousCursor = start
}
|
||
/// Creates a tokenizer over `input` decoded from `encoding`.
///
/// Fails when `Swift.String(decoding:as:)` cannot decode the input.
public init?(input: AK.StringView, encoding: AK.StringView) {
    guard let decoded = Swift.String(decoding: input, as: encoding) else {
        return nil
    }
    self.input = decoded
    let start = self.input.startIndex
    self.cursor = start
    self.previousCursor = start
}
|
||
|
||
/// Creates a tokenizer over `input`, decoding it as UTF-8.
public convenience init?(input: AK.StringView) {
    let utf8: AK.StringView = "UTF-8"
    self.init(input: input, encoding: utf8)
}
|
||
|
||
/// Marks the tokenizer as aborted; `nextToken` then returns `nil` once the
/// queue of already-produced tokens is empty.
public func abort() {
    aborted = true
}
|
||
|
||
/// Remembers the current cursor in `previousCursor`, then advances the cursor
/// by `count` characters, stopping at the end of the input.
func skip(_ count: Int) {
    previousCursor = cursor
    if let advanced = input.index(cursor, offsetBy: count, limitedBy: input.endIndex) {
        cursor = advanced
    } else {
        cursor = input.endIndex
    }
}
|
||
|
||
/// Returns the character `offset` positions away from the cursor without consuming it.
///
/// - Parameter offset: Distance from the cursor; `0` is the character at the cursor.
/// - Returns: The character at that position, or `nil` if the position lies outside
///   the input (including an empty input or a cursor already at the end).
func peekCodePoint(_ offset: Int = 0) -> Character? {
    // Bound the walk on the side we are moving towards so it can never leave the string.
    // (The previous limit, `index(before: endIndex)`, trapped on empty input and was
    // ignored when the cursor was already at `endIndex`.)
    let limit = offset < 0 ? self.input.startIndex : self.input.endIndex
    guard let index = self.input.index(self.cursor, offsetBy: offset, limitedBy: limit),
          index < self.input.endIndex else {
        return nil
    }
    return self.input[index]
}
|
||
|
||
/// Returns the next `count` characters starting at the cursor without consuming them.
///
/// - Parameter count: Number of characters to look at.
/// - Returns: The substring of exactly `count` characters, or `nil` if `count` is
///   negative or fewer than `count` characters remain.
func peekNext(count: Int) -> Swift.Substring? {
    // The upper bound of the slice is exclusive, so it may legitimately equal
    // `endIndex`; limiting to the index *before* `endIndex` wrongly rejected a
    // request for exactly the remaining characters (and trapped on empty input).
    guard count >= 0,
          let endIndex = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.endIndex) else {
        return nil
    }
    return self.input[self.cursor..<endIndex]
}
|
||
|
||
/// Consumes and returns the next input character, with newlines normalized.
///
/// - Returns: The consumed character (`"\n"` for CR LF and for a lone CR), or
///   `nil` when the cursor is at the end of the input.
func nextCodePoint() -> Character? {
    guard self.cursor < self.input.endIndex else {
        return nil
    }

    // https://html.spec.whatwg.org/multipage/parsing.html#preprocessing-the-input-stream:tokenization
    // https://infra.spec.whatwg.org/#normalize-newlines
    let current = self.input[self.cursor]

    // Every branch consumes exactly one Character. Swift groups U+000D U+000A into a
    // single extended grapheme cluster, so a CR LF pair arrives here as the one
    // Character "\r\n" and can never be observed as "\r" followed by "\n".
    skip(1)

    switch current {
    case "\r\n":
        // replace every U+000D CR U+000A LF code point pair with a single U+000A LF code point,
        return "\n"
    case "\r":
        // replace every remaining U+000D CR code point with a U+000A LF code point.
        return "\n"
    default:
        return current
    }
}
|
||
|
||
/// Rewinds the cursor to where it was before the most recent `skip(_:)`.
func restoreCursorToPrevious() {
    cursor = previousCursor
}
|
||
|
||
/// Makes `token` the current token; tag tokens also get a fresh, empty
/// attribute list to collect into.
func createNewToken(_ token: HTMLToken) {
    self.currentToken = token
    // FIXME: Assign Position
    guard self.currentToken.isTag() else {
        return
    }
    self.currentTokensAttributes = []
}
|
||
|
||
/// What `finalizeCurrentAttribute` does with the text collected in the builder:
/// store it as the attribute's name, store it as its value, or discard it.
enum AttributeStringBehavior {
    case SetName, SetValue, IgnoreString
}
|
||
/// Moves the attribute under construction into the current token's attribute list.
///
/// The builder is always drained; `behavior` decides whether its text becomes the
/// attribute's name, its value, or is thrown away.
/// Requires a current attribute and a current attribute list.
func finalizeCurrentAttribute(_ behavior: AttributeStringBehavior) {
    precondition(self.currentAttribute != nil && self.currentTokensAttributes != nil)

    var attribute = self.currentAttribute!
    let text = self.currentBuilder.takeString()
    switch behavior {
    case .SetName:
        attribute.localName = text
    case .SetValue:
        attribute.value = text
    case .IgnoreString:
        break
    }

    self.currentTokensAttributes!.append(attribute)
    self.currentAttribute = nil
}
|
||
|
||
/// Result of one `nextTokenImpl` step, interpreted by the loop in `nextToken`.
enum NextTokenState {
    /// Return `token` to the caller (`nil` means there is nothing more to return).
    case Emit(token: HTMLToken?)
    /// Run the next step on `inputCharacter` again instead of consuming new input.
    case Reconsume(inputCharacter: Character?)
    /// Hand out queued tokens first, then keep going with fresh input.
    case ReprocessQueue
    /// Keep going with fresh input.
    case SwitchTo, Continue
}
|
||
|
||
/// Produces the next token.
///
/// Queued tokens are handed out first. Otherwise the state machine is stepped until
/// it emits a token. Returns `nil` when aborted with an empty queue, or when the
/// state machine emits `nil`.
/// - Parameter stopAtInsertionPoint: Currently unused (see FIXME below).
public func nextToken(stopAtInsertionPoint: Bool = false) -> HTMLToken? {

    // Outer optional: "is there something to return right now?".
    // Inner optional: the value to return (nil once aborted and drained).
    func drainQueue() -> HTMLToken?? {
        if let queued = self.queuedTokens.popFirst() {
            return queued
        }
        if self.aborted {
            return .some(nil)
        }
        return .none
    }

    if let result = drainQueue() {
        return result
    }

    var pendingCharacter: Character? = nil
    while true {
        // FIXME: Handle insertion point
        switch nextTokenImpl(pendingCharacter) {
        case .Emit(let token):
            return token
        case .Reconsume(let character):
            pendingCharacter = character
        case .ReprocessQueue:
            if let result = drainQueue() {
                return result
            }
            pendingCharacter = nil
        case .SwitchTo, .Continue:
            pendingCharacter = nil
        }
    }
}
|
||
|
||
/// Stays in the current state and asks the driver loop for the next character.
func continueInCurrentState() -> NextTokenState {
    .Continue
}
|
||
|
||
/// Moves the state machine to `state`; the next step consumes fresh input.
func switchTo(_ state: State) -> NextTokenState {
    let outcome = NextTokenState.SwitchTo
    self.state = state
    return outcome
}
|
||
|
||
/// Moves the state machine to `state` and has it process `character` again
/// instead of consuming new input.
func reconsume(_ character: Character?, `in` state: State) -> NextTokenState {
    let outcome = NextTokenState.Reconsume(inputCharacter: character)
    self.state = state
    return outcome
}
|
||
|
||
/// Resumes the saved return state and asks the driver loop to hand out any
/// queued tokens first.
func switchToReturnState() -> NextTokenState {
    let resumed = self.returnState
    self.state = resumed
    return .ReprocessQueue
}
|
||
|
||
/// Resumes the saved return state so that `character` is read again from the input.
///
/// A non-nil character is "un-consumed" by rewinding the cursor. For `nil` the
/// cursor is left alone. Queued tokens are handed out first.
func reconsumeInReturnState(_ character: Character?) -> NextTokenState {
    self.state = self.returnState
    guard character != nil else {
        return .ReprocessQueue
    }
    restoreCursorToPrevious()
    return .ReprocessQueue
}
|
||
|
||
/// Switches to `state`, queues the current token, and emits the token at the
/// front of the queue.
///
/// Tag tokens receive the attributes collected so far (empty if none were
/// collected). The current token is reset afterwards.
func switchToAndEmitCurrentToken(_ state: State) -> NextTokenState {
    self.state = state

    if currentToken.isTag() {
        let collected = currentTokensAttributes ?? []
        currentToken.attributes = collected
        currentTokensAttributes = nil
    }

    queuedTokens.append(currentToken)
    currentToken = HTMLToken()

    // Non-empty: a token was appended just above.
    let front = queuedTokens.popFirst()!
    return .Emit(token: front)
}
|
||
|
||
/// Switches to `state`, then emits `character` via `emitCharacter`.
func switchToAndEmitCharacter(_ state: State, character: Character) -> NextTokenState {
    self.state = state
    let outcome = emitCharacter(character)
    return outcome
}
|
||
|
||
/// Queues a character token for `character`, switches to the `in` state, and has
/// that state process `currentInputCharacter` again.
func emitCharacterAndReconsume(_ character: Character, `in`: State, currentInputCharacter: Character?) -> NextTokenState {
    let characterToken = HTMLToken(type: .Character(codePoint: character))
    queuedTokens.append(characterToken)
    state = `in`
    return .Reconsume(inputCharacter: currentInputCharacter)
}
|
||
|
||
/// Queues an end-of-file token (only once) and emits the token at the front of the queue.
///
/// Once the end-of-file token has been queued, later calls emit `nil`.
func emitEOF() -> NextTokenState {
    guard !hasEmittedEOF else {
        return .Emit(token: nil)
    }
    hasEmittedEOF = true

    createNewToken(HTMLToken(type: .EndOfFile))
    queuedTokens.append(currentToken)
    currentToken = HTMLToken()

    // Non-empty: the end-of-file token was appended just above.
    let front = queuedTokens.popFirst()!
    return .Emit(token: front)
}
|
||
|
||
/// Queues the current token (with its collected attributes if it is a tag) and
/// then hands over to `emitEOF`.
///
/// Must not be called after the end-of-file token has been emitted.
func emitCurrentTokenFollowedByEOF() -> NextTokenState {
    precondition(!hasEmittedEOF)

    if currentToken.isTag() {
        let collected = currentTokensAttributes ?? []
        currentToken.attributes = collected
        currentTokensAttributes = nil
    }

    queuedTokens.append(currentToken)
    currentToken = HTMLToken()
    return emitEOF()
}
|
||
|
||
/// Queues a character token for `character` and emits the token at the front of the queue.
func emitCharacter(_ character: Character) -> NextTokenState {
    let characterToken = HTMLToken(type: .Character(codePoint: character))
    createNewToken(characterToken)
    queuedTokens.append(currentToken)
    currentToken = HTMLToken()

    // Non-empty: a token was appended just above.
    let front = queuedTokens.popFirst()!
    return .Emit(token: front)
}
|
||
|
||
/// Drains the temporary buffer: inside an attribute value its text is appended to
/// the builder, otherwise each character is queued as its own character token.
func flushCodepointsConsumedAsACharacterReference() {
    let insideAttribute = consumedAsPartOfAnAttribute()
    let consumed = temporaryBuffer.takeString()

    guard !insideAttribute else {
        currentBuilder += consumed
        return
    }

    consumed.forEach { character in
        queuedTokens.append(HTMLToken(type: .Character(codePoint: character)))
    }
}
|
||
|
||
/// Whether the return state is one of the three attribute-value states.
func consumedAsPartOfAnAttribute() -> Bool {
    switch self.returnState {
    case .AttributeValueDoubleQuoted, .AttributeValueSingleQuoted, .AttributeValueUnquoted:
        return true
    default:
        return false
    }
}
|
||
|
||
/// Whether the end tag `token` has the same name as `lastStartTagName`.
///
/// Returns `false` when no start tag name has been recorded. Traps if `token` is
/// not an end tag.
func isAppropriateEndTagToken(_ token: HTMLToken) -> Bool {
    guard case let .EndTag(endTagName, _, _, _) = token.type else {
        preconditionFailure("isAppropriateEndTagToken called with non-end-tag token")
    }
    guard let startTagName = self.lastStartTagName else {
        return false
    }
    return startTagName == endTagName
}
|
||
|
||
func nextTokenImpl(_ nextInputCharacter: Character? = nil) -> NextTokenState {
|
||
let dontConsumeNextInputCharacter = {
|
||
self.restoreCursorToPrevious()
|
||
}
|
||
let _ = dontConsumeNextInputCharacter
|
||
|
||
// Handle reconsume by passing the character around in the state enum
|
||
let currentInputCharacter = nextInputCharacter ?? nextCodePoint()
|
||
|
||
switch self.state {
|
||
// 13.2.5.1 Data state, https://html.spec.whatwg.org/multipage/parsing.html#data-state
|
||
case .Data:
|
||
precondition(currentTokensAttributes == nil)
|
||
switch currentInputCharacter {
|
||
case "&":
|
||
self.returnState = .Data
|
||
return switchTo(.CharacterReference)
|
||
case "<":
|
||
return switchTo(.TagOpen)
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
return emitCharacter("\u{FFFD}")
|
||
case nil:
|
||
return emitEOF()
|
||
default:
|
||
return emitCharacter(currentInputCharacter!)
|
||
}
|
||
|
||
// 13.2.5.2 RCDATA state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
|
||
case .RCDATA:
|
||
switch currentInputCharacter {
|
||
case "&":
|
||
self.returnState = .RCDATA
|
||
return switchTo(.CharacterReference)
|
||
case "<":
|
||
return switchTo(.RCDATALessThanSign)
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
return emitCharacter("\u{FFFD}")
|
||
case nil:
|
||
return emitEOF()
|
||
default:
|
||
return emitCharacter(currentInputCharacter!)
|
||
}
|
||
|
||
// 13.2.5.3. RAWTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-state
|
||
case .RAWTEXT:
|
||
switch currentInputCharacter {
|
||
case "<":
|
||
return switchTo(.RAWTEXTLessThanSign)
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
return emitCharacter("\u{FFFD}")
|
||
case nil:
|
||
return emitEOF()
|
||
default:
|
||
return emitCharacter(currentInputCharacter!)
|
||
}
|
||
// 13.2.5.4 Script data state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-state
|
||
case .ScriptData:
|
||
switch currentInputCharacter {
|
||
case "<":
|
||
return switchTo(.ScriptDataLessThanSign)
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
return emitCharacter("\u{FFFD}")
|
||
case nil:
|
||
return emitEOF()
|
||
default:
|
||
return emitCharacter(currentInputCharacter!)
|
||
}
|
||
// 13.2.5.5 PLAINTEXT state, https://html.spec.whatwg.org/multipage/parsing.html#plaintext-state
|
||
case .PLAINTEXT:
|
||
switch currentInputCharacter {
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
return emitCharacter("\u{FFFD}")
|
||
case nil:
|
||
return emitEOF()
|
||
default:
|
||
return emitCharacter(currentInputCharacter!)
|
||
}
|
||
// 13.2.5.6 Tag open state https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
|
||
case .TagOpen:
|
||
switch currentInputCharacter {
|
||
case "!":
|
||
return switchTo(.MarkupDeclarationOpen)
|
||
case "/":
|
||
return switchTo(.EndTagOpen)
|
||
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
|
||
createNewToken(HTMLToken(type: .StartTag(tagName: "")))
|
||
return reconsume(currentInputCharacter!, in: .TagName)
|
||
case "?":
|
||
// FIXME: log_parse_error()
|
||
createNewToken(HTMLToken(type: .Comment(data: "")))
|
||
return reconsume(currentInputCharacter!, in: .BogusComment)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
|
||
return emitEOF()
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
|
||
return reconsume(currentInputCharacter!, in: .Data)
|
||
}
|
||
// 13.2.5.7 End tag open state, https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
|
||
case .EndTagOpen:
|
||
switch currentInputCharacter {
|
||
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
|
||
createNewToken(HTMLToken(type: .EndTag(tagName: "")))
|
||
return reconsume(currentInputCharacter!, in: .TagName)
|
||
default:
|
||
return emitEOF()
|
||
}
|
||
// 13.2.5.8 Tag name state, https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
|
||
case .TagName:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
self.currentToken.tagName = self.currentBuilder.takeString()
|
||
return switchTo(.BeforeAttributeName)
|
||
case "/":
|
||
self.currentToken.tagName = self.currentBuilder.takeString()
|
||
return switchTo(.SelfClosingStartTag)
|
||
case ">":
|
||
self.currentToken.tagName = self.currentBuilder.takeString()
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
|
||
currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
|
||
return continueInCurrentState()
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
currentBuilder += "\u{FFFD}"
|
||
return continueInCurrentState()
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
return emitEOF()
|
||
default:
|
||
currentBuilder.append(currentInputCharacter!)
|
||
return continueInCurrentState()
|
||
}
|
||
// 13.2.5.9 RCDATA less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-less-than-sign-state
|
||
case .RCDATALessThanSign:
|
||
switch currentInputCharacter {
|
||
case "/":
|
||
self.temporaryBuffer = ""
|
||
return switchTo(.RCDATAEndTagOpen)
|
||
default:
|
||
return emitCharacterAndReconsume("<", in: .RCDATA, currentInputCharacter: currentInputCharacter)
|
||
}
|
||
// 13.2.5.10 RCDATA end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-open-state
|
||
case .RCDATAEndTagOpen:
|
||
switch currentInputCharacter {
|
||
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
|
||
createNewToken(HTMLToken(type: .EndTag(tagName: "")))
|
||
return reconsume(currentInputCharacter!, in: .RCDATAEndTagName)
|
||
default:
|
||
queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
|
||
queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
|
||
return reconsume(currentInputCharacter, in: .RCDATA)
|
||
}
|
||
// 13.2.5.11 RCDATA end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rcdata-end-tag-name-state
|
||
case .RCDATAEndTagName:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
if self.isAppropriateEndTagToken(currentToken) {
|
||
return switchTo(.BeforeAttributeName)
|
||
}
|
||
break
|
||
case "/":
|
||
if self.isAppropriateEndTagToken(currentToken) {
|
||
return switchTo(.SelfClosingStartTag)
|
||
}
|
||
break
|
||
case ">":
|
||
if self.isAppropriateEndTagToken(currentToken) {
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
}
|
||
break
|
||
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
|
||
self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
|
||
self.temporaryBuffer.append(c)
|
||
return continueInCurrentState()
|
||
case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
|
||
self.currentBuilder.append(c)
|
||
self.temporaryBuffer.append(c)
|
||
return continueInCurrentState()
|
||
default:
|
||
break
|
||
}
|
||
|
||
// First three steps fall through to the "anything else" block
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
|
||
// NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
|
||
self.currentBuilder = ""
|
||
for codePoint in self.temporaryBuffer {
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
|
||
}
|
||
return reconsume(currentInputCharacter, in: .RCDATA)
|
||
// 13.2.5.12 RAWTEXT less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state
|
||
case .RAWTEXTLessThanSign:
|
||
switch currentInputCharacter {
|
||
case "/":
|
||
self.temporaryBuffer = ""
|
||
return switchTo(.RAWTEXTEndTagOpen)
|
||
default:
|
||
return emitCharacterAndReconsume("<", in: .RAWTEXT, currentInputCharacter: currentInputCharacter)
|
||
}
|
||
// 13.2.5.13 RAWTEXT end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state
|
||
case .RAWTEXTEndTagOpen:
|
||
switch currentInputCharacter {
|
||
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
|
||
createNewToken(HTMLToken(type: .EndTag(tagName: "")))
|
||
return reconsume(currentInputCharacter!, in: .RAWTEXTEndTagName)
|
||
default:
|
||
queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
|
||
queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
|
||
return reconsume(currentInputCharacter, in: .RAWTEXT)
|
||
}
|
||
// 13.2.5.14 RAWTEXT end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state
|
||
case .RAWTEXTEndTagName:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
if self.isAppropriateEndTagToken(currentToken) {
|
||
return switchTo(.BeforeAttributeName)
|
||
}
|
||
break
|
||
case "/":
|
||
if self.isAppropriateEndTagToken(currentToken) {
|
||
return switchTo(.SelfClosingStartTag)
|
||
}
|
||
break
|
||
case ">":
|
||
if self.isAppropriateEndTagToken(currentToken) {
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
}
|
||
break
|
||
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
|
||
self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
|
||
self.temporaryBuffer.append(c)
|
||
return continueInCurrentState()
|
||
case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
|
||
self.currentBuilder.append(c)
|
||
self.temporaryBuffer.append(c)
|
||
return continueInCurrentState()
|
||
default:
|
||
break
|
||
}
|
||
|
||
// First three steps fall through to the "anything else" block
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
|
||
// NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
|
||
self.currentBuilder = ""
|
||
for codePoint in self.temporaryBuffer {
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
|
||
}
|
||
return reconsume(currentInputCharacter, in: .RAWTEXT)
|
||
// 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state
|
||
case .ScriptDataLessThanSign:
|
||
switch currentInputCharacter {
|
||
case "/":
|
||
self.temporaryBuffer = ""
|
||
return switchTo(.ScriptDataEndTagOpen)
|
||
case "!":
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "!")))
|
||
return switchTo(.ScriptDataEscapeStart)
|
||
default:
|
||
return emitCharacterAndReconsume("<", in: .ScriptData, currentInputCharacter: currentInputCharacter)
|
||
}
|
||
// 13.2.5.16 Script data end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-open-state
|
||
case .ScriptDataEndTagOpen:
|
||
switch currentInputCharacter {
|
||
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
|
||
createNewToken(HTMLToken(type: .EndTag(tagName: "")))
|
||
return reconsume(currentInputCharacter!, in: .ScriptDataEndTagName)
|
||
default:
|
||
queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
|
||
queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
|
||
return reconsume(currentInputCharacter, in: .ScriptData)
|
||
}
|
||
// 13.2.5.17 Script data end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-end-tag-name-state
|
||
case .ScriptDataEndTagName:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
if self.isAppropriateEndTagToken(currentToken) {
|
||
return switchTo(.BeforeAttributeName)
|
||
}
|
||
break
|
||
case "/":
|
||
if self.isAppropriateEndTagToken(currentToken) {
|
||
return switchTo(.SelfClosingStartTag)
|
||
}
|
||
break
|
||
case ">":
|
||
if self.isAppropriateEndTagToken(currentToken) {
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
}
|
||
break
|
||
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
|
||
self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
|
||
self.temporaryBuffer.append(c)
|
||
return continueInCurrentState()
|
||
case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
|
||
self.currentBuilder.append(c)
|
||
self.temporaryBuffer.append(c)
|
||
return continueInCurrentState()
|
||
default:
|
||
break
|
||
}
|
||
|
||
// First three steps fall through to the "anything else" block
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
|
||
// NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
|
||
self.currentBuilder = ""
|
||
for codePoint in self.temporaryBuffer {
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
|
||
}
|
||
return reconsume(currentInputCharacter, in: .ScriptData)
|
||
// 13.2.5.18 Script data escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-state
|
||
case .ScriptDataEscapeStart:
|
||
switch currentInputCharacter {
|
||
case "-":
|
||
return switchToAndEmitCharacter(.ScriptDataEscapeStartDash, character: "-")
|
||
default:
|
||
return reconsume(currentInputCharacter, in: .ScriptData)
|
||
}
|
||
// 13.2.5.19 Script data escape start dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escape-start-dash-state
|
||
case .ScriptDataEscapeStartDash:
|
||
switch currentInputCharacter {
|
||
case "-":
|
||
return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-")
|
||
default:
|
||
return reconsume(currentInputCharacter, in: .ScriptData)
|
||
}
|
||
// 13.2.5.20 Script data escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-state
|
||
case .ScriptDataEscaped:
|
||
switch currentInputCharacter {
|
||
case "-":
|
||
return switchToAndEmitCharacter(.ScriptDataEscapedDash, character: "-")
|
||
case "<":
|
||
return switchTo(.ScriptDataEscapedLessThanSign)
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
return emitCharacter("\u{FFFD}")
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
return emitEOF()
|
||
default:
|
||
return emitCharacter(currentInputCharacter!)
|
||
}
|
||
// 13.2.5.21 Script data escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-state
|
||
case .ScriptDataEscapedDash:
|
||
switch currentInputCharacter {
|
||
case "-":
|
||
return switchToAndEmitCharacter(.ScriptDataEscapedDashDash, character: "-")
|
||
case "<":
|
||
return switchTo(.ScriptDataEscapedLessThanSign)
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}")
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
return emitEOF()
|
||
default:
|
||
return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
|
||
}
|
||
// 13.2.5.22 Script data escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-dash-dash-state
|
||
case .ScriptDataEscapedDashDash:
|
||
switch currentInputCharacter {
|
||
case "-":
|
||
return emitCharacter("-")
|
||
case "<":
|
||
return switchTo(.ScriptDataEscapedLessThanSign)
|
||
case ">":
|
||
return switchToAndEmitCharacter(.ScriptData, character: ">")
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
return switchToAndEmitCharacter(.ScriptDataEscaped, character: "\u{FFFD}")
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
return emitEOF()
|
||
default:
|
||
return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
|
||
}
|
||
// 13.2.5.23 Script data escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-less-than-sign-state
|
||
case .ScriptDataEscapedLessThanSign:
|
||
switch currentInputCharacter {
|
||
case "/":
|
||
self.temporaryBuffer = ""
|
||
return switchTo(.ScriptDataEscapedEndTagOpen)
|
||
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
|
||
self.temporaryBuffer = ""
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
|
||
return reconsume(currentInputCharacter!, in: .ScriptDataDoubleEscapeStart)
|
||
default:
|
||
return emitCharacterAndReconsume("<", in: .ScriptDataEscaped, currentInputCharacter: currentInputCharacter)
|
||
}
|
||
// 13.2.5.24 Script data escaped end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-open-state
|
||
case .ScriptDataEscapedEndTagOpen:
|
||
switch currentInputCharacter {
|
||
case let c? where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!):
|
||
createNewToken(HTMLToken(type: .EndTag(tagName: "")))
|
||
return reconsume(currentInputCharacter!, in: .ScriptDataEscapedEndTagName)
|
||
default:
|
||
queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
|
||
queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
|
||
return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
|
||
}
|
||
// 13.2.5.25 Script data escaped end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-escaped-end-tag-name-state
|
||
case .ScriptDataEscapedEndTagName:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
if self.isAppropriateEndTagToken(currentToken) {
|
||
return switchTo(.BeforeAttributeName)
|
||
}
|
||
break
|
||
case "/":
|
||
if self.isAppropriateEndTagToken(currentToken) {
|
||
return switchTo(.SelfClosingStartTag)
|
||
}
|
||
break
|
||
case ">":
|
||
if self.isAppropriateEndTagToken(currentToken) {
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
}
|
||
break
|
||
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
|
||
self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
|
||
self.temporaryBuffer.append(c)
|
||
return continueInCurrentState()
|
||
case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
|
||
self.currentBuilder.append(c)
|
||
self.temporaryBuffer.append(c)
|
||
return continueInCurrentState()
|
||
default:
|
||
break
|
||
}
|
||
|
||
// First three steps fall through to the "anything else" block
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<")))
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/")))
|
||
// NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case.
|
||
self.currentBuilder = ""
|
||
for codePoint in self.temporaryBuffer {
|
||
self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint)))
|
||
}
|
||
return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
|
||
// 13.2.5.26 Script data double escape start state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-start-state
|
||
case .ScriptDataDoubleEscapeStart:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ", "/", ">":
|
||
if self.temporaryBuffer == "script" {
|
||
return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
|
||
} else {
|
||
return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
|
||
}
|
||
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
|
||
self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
|
||
return emitCharacter(currentInputCharacter!)
|
||
case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
|
||
self.temporaryBuffer.append(c)
|
||
return emitCharacter(currentInputCharacter!)
|
||
default:
|
||
return reconsume(currentInputCharacter, in: .ScriptDataEscaped)
|
||
}
|
||
// 13.2.5.27 Script data double escaped state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-state
case .ScriptDataDoubleEscaped:
    // Every non-EOF input handled here produces a character token; "-" and "<"
    // additionally move the tokenizer to a different state.
    switch currentInputCharacter {
    case "-":
        return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDash, character: "-")
    case "<":
        // The spec says to switch state AND emit U+003C. A bare switchTo() would
        // silently drop the "<" from the script text. This matches the "<" branch of
        // the ScriptDataDoubleEscapedDashDash arm.
        return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
    case "\0":
        // FIXME: log_parse_error()
        // NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
        return emitCharacter("\u{FFFD}")
    case nil:
        // FIXME: log_parse_error()
        // nil represents end of input here.
        return emitEOF()
    default:
        // Safe to force-unwrap: the nil case is handled above.
        return emitCharacter(currentInputCharacter!)
    }
|
||
// 13.2.5.28 Script data double escaped dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-state
case .ScriptDataDoubleEscapedDash:
    // Entered after a single "-" inside double-escaped script data. Every non-EOF
    // input handled here produces a character token and selects the next state.
    switch currentInputCharacter {
    case "-":
        return switchToAndEmitCharacter(.ScriptDataDoubleEscapedDashDash, character: "-")
    case "<":
        // The spec says to switch state AND emit U+003C. A bare switchTo() would
        // silently drop the "<" from the script text. This matches the "<" branch of
        // the ScriptDataDoubleEscapedDashDash arm.
        return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
    case "\0":
        // FIXME: log_parse_error()
        // NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
        return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}")
    case nil:
        // FIXME: log_parse_error()
        // nil represents end of input here.
        return emitEOF()
    default:
        // Safe to force-unwrap: the nil case is handled above.
        return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
    }
|
||
// 13.2.5.29 Script data double escaped dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-dash-dash-state
|
||
case .ScriptDataDoubleEscapedDashDash:
|
||
switch currentInputCharacter {
|
||
case "-":
|
||
return emitCharacter("-")
|
||
case "<":
|
||
return switchToAndEmitCharacter(.ScriptDataDoubleEscapedLessThanSign, character: "<")
|
||
case ">":
|
||
return switchToAndEmitCharacter(.ScriptData, character: ">")
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: "\u{FFFD}")
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
return emitEOF()
|
||
default:
|
||
return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
|
||
}
|
||
// 13.2.5.30 Script data double escaped less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escaped-less-than-sign-state
|
||
case .ScriptDataDoubleEscapedLessThanSign:
|
||
switch currentInputCharacter {
|
||
case "/":
|
||
self.temporaryBuffer = ""
|
||
return switchToAndEmitCharacter(.ScriptDataDoubleEscapeEnd, character: "/")
|
||
default:
|
||
return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped)
|
||
}
|
||
// 13.2.5.31 Script data double escape end state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-double-escape-end-state
|
||
case .ScriptDataDoubleEscapeEnd:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ", "/", ">":
|
||
if self.temporaryBuffer == "script" {
|
||
return switchToAndEmitCharacter(.ScriptDataEscaped, character: currentInputCharacter!)
|
||
} else {
|
||
return switchToAndEmitCharacter(.ScriptDataDoubleEscaped, character: currentInputCharacter!)
|
||
}
|
||
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
|
||
self.temporaryBuffer.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
|
||
return emitCharacter(currentInputCharacter!)
|
||
case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!):
|
||
self.temporaryBuffer.append(c)
|
||
return emitCharacter(currentInputCharacter!)
|
||
default:
|
||
return reconsume(currentInputCharacter, in: .ScriptDataDoubleEscaped)
|
||
}
|
||
// 13.2.5.32 Before attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
|
||
case .BeforeAttributeName:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return continueInCurrentState()
|
||
case "/", ">", nil:
|
||
return reconsume(currentInputCharacter, in: .AfterAttributeName)
|
||
case "=":
|
||
// FIXME: log_parse_error()
|
||
self.currentBuilder = Swift.String(currentInputCharacter!)
|
||
self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
|
||
return switchTo(.AttributeName)
|
||
default:
|
||
self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
|
||
return reconsume(currentInputCharacter!, in: .AttributeName)
|
||
}
|
||
// 13.2.5.33 Attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
|
||
case .AttributeName:
|
||
// FIXME: When the user agent leaves the attribute name state (and before emitting the tag token, if appropriate),
|
||
// the complete attribute's name must be compared to the other attributes on the same token;
|
||
// if there is already an attribute on the token with the exact same name, then this is a duplicate-attribute
|
||
// parse error and the new attribute must be removed from the token.
|
||
// NOTE: If an attribute is so removed from a token, it, and the value that gets associated with it, if any,
|
||
// are never subsequently used by the parser, and are therefore effectively discarded. Removing the attribute
|
||
// in this way does not change its status as the "current attribute" for the purposes of the tokenizer, however.
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ", "/", ">", nil:
|
||
// FIXME: set name position
|
||
self.currentAttribute!.localName = self.currentBuilder.takeString()
|
||
return reconsume(currentInputCharacter, in: .AfterAttributeName)
|
||
case "=":
|
||
// FIXME: set name position
|
||
self.currentAttribute!.localName = self.currentBuilder.takeString()
|
||
return switchTo(.BeforeAttributeValue)
|
||
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
|
||
self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
|
||
return continueInCurrentState()
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
self.currentBuilder.append("\u{FFFD}")
|
||
return continueInCurrentState()
|
||
default:
|
||
self.currentBuilder.append(currentInputCharacter!)
|
||
return continueInCurrentState()
|
||
}
|
||
// 13.2.5.34 After attribute name state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
|
||
case .AfterAttributeName:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return continueInCurrentState()
|
||
case "/":
|
||
self.finalizeCurrentAttribute(.SetName)
|
||
return switchTo(.SelfClosingStartTag)
|
||
case "=":
|
||
self.finalizeCurrentAttribute(.SetName)
|
||
return switchTo(.BeforeAttributeValue)
|
||
case ">":
|
||
self.finalizeCurrentAttribute(.SetName)
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.finalizeCurrentAttribute(.IgnoreString)
|
||
return emitEOF()
|
||
default:
|
||
self.finalizeCurrentAttribute(.SetName)
|
||
self.currentAttribute = HTMLToken.Attribute(localName: "", value: "")
|
||
return reconsume(currentInputCharacter!, in: .AttributeName)
|
||
}
|
||
// 13.2.5.35 Before attribute value state, https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
|
||
case .BeforeAttributeValue:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return continueInCurrentState()
|
||
case "\"":
|
||
return switchTo(.AttributeValueDoubleQuoted)
|
||
case "'":
|
||
return switchTo(.AttributeValueSingleQuoted)
|
||
case ">":
|
||
// FIXME: log_parse_error()
|
||
self.finalizeCurrentAttribute(.IgnoreString)
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
default:
|
||
return reconsume(currentInputCharacter, in: .AttributeValueUnquoted)
|
||
}
|
||
// 13.2.5.36 Attribute value (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-double-quoted-state
|
||
case .AttributeValueDoubleQuoted:
|
||
switch currentInputCharacter {
|
||
case "\"":
|
||
return switchTo(.AfterAttributeValueQuoted)
|
||
case "&":
|
||
self.returnState = .AttributeValueDoubleQuoted
|
||
return switchTo(.CharacterReference)
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
self.currentBuilder.append("\u{FFFD}")
|
||
return continueInCurrentState()
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.finalizeCurrentAttribute(.IgnoreString)
|
||
return emitEOF()
|
||
default:
|
||
self.currentBuilder.append(currentInputCharacter!)
|
||
return continueInCurrentState()
|
||
}
|
||
// 13.2.5.37 Attribute value (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-single-quoted-state
case .AttributeValueSingleQuoted:
    // Accumulates the attribute value in currentBuilder until the closing "'".
    switch currentInputCharacter {
    case "'":
        return switchTo(.AfterAttributeValueQuoted)
    case "&":
        // Remember where to resume once the character reference has been handled.
        self.returnState = .AttributeValueSingleQuoted
        return switchTo(.CharacterReference)
    case "\0":
        // FIXME: log_parse_error()
        // NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
        self.currentBuilder.append("\u{FFFD}")
        return continueInCurrentState()
    case nil:
        // FIXME: log_parse_error()
        // Discard the partially built attribute before emitting EOF, exactly as the
        // double-quoted, unquoted, after-attribute-name and after-attribute-value
        // arms do on end of input.
        self.finalizeCurrentAttribute(.IgnoreString)
        return emitEOF()
    default:
        // Safe to force-unwrap: the nil case is handled above.
        self.currentBuilder.append(currentInputCharacter!)
        return continueInCurrentState()
    }
|
||
// 13.2.5.38 Attribute value (unquoted) state, https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-unquoted-state
|
||
case .AttributeValueUnquoted:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
self.finalizeCurrentAttribute(.SetValue)
|
||
return switchTo(.BeforeAttributeName)
|
||
case "&":
|
||
self.returnState = .AttributeValueUnquoted
|
||
return switchTo(.CharacterReference)
|
||
case ">":
|
||
self.finalizeCurrentAttribute(.SetValue)
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
self.currentBuilder.append("\u{FFFD}")
|
||
return continueInCurrentState()
|
||
case "\"", "'", "<", "=", "`":
|
||
// FIXME: log_parse_error()
|
||
self.currentBuilder.append(currentInputCharacter!)
|
||
return continueInCurrentState()
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.finalizeCurrentAttribute(.IgnoreString)
|
||
return emitEOF()
|
||
default:
|
||
self.currentBuilder.append(currentInputCharacter!)
|
||
return continueInCurrentState()
|
||
}
|
||
// 13.2.5.39 After attribute value (quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-quoted-state
|
||
case .AfterAttributeValueQuoted:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
self.finalizeCurrentAttribute(.SetValue)
|
||
return switchTo(.BeforeAttributeName)
|
||
case "/":
|
||
self.finalizeCurrentAttribute(.SetValue)
|
||
return switchTo(.SelfClosingStartTag)
|
||
case ">":
|
||
self.finalizeCurrentAttribute(.SetValue)
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.finalizeCurrentAttribute(.IgnoreString)
|
||
return emitEOF()
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
self.finalizeCurrentAttribute(.SetValue)
|
||
return reconsume(currentInputCharacter!, in: .BeforeAttributeName)
|
||
}
|
||
// 13.2.5.40 Self-closing start tag state, https://html.spec.whatwg.org/multipage/parsing.html#self-closing-start-tag-state
|
||
case .SelfClosingStartTag:
|
||
switch currentInputCharacter {
|
||
case ">":
|
||
self.currentToken.selfClosing = true
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
return emitEOF()
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
return reconsume(currentInputCharacter!, in: .BeforeAttributeName)
|
||
}
|
||
// 13.2.5.41 Bogus comment state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
|
||
case .BogusComment:
|
||
switch currentInputCharacter {
|
||
case ">":
|
||
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
|
||
return emitCurrentTokenFollowedByEOF()
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
currentBuilder.append("\u{FFFD}")
|
||
return continueInCurrentState()
|
||
default:
|
||
self.currentBuilder.append(currentInputCharacter!)
|
||
return continueInCurrentState()
|
||
}
|
||
// 13.2.5.42 Markup declaration open state, https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
case .MarkupDeclarationOpen:
    // This state decides by look-ahead only: it peeks at the upcoming characters and
    // skips exactly the ones that matched.
    dontConsumeNextInputCharacter()
    if let nextTwo = peekNext(count: 2), nextTwo == "--" {
        skip(2)
        // Spec: "create a comment token whose data is the empty string". Without this,
        // the CommentStart arm's ">" branch would emit whatever currentToken was left
        // over from earlier input.
        self.currentToken = HTMLToken(type: .Comment(data: ""))
        return switchTo(.CommentStart)
    } else if let nextSeven = peekNext(count: 7), nextSeven.uppercased() == "DOCTYPE" {
        // "DOCTYPE" is matched ASCII case-insensitively.
        skip(7)
        return switchTo(.DOCTYPE)
    } else if let nextSeven = peekNext(count: 7), nextSeven == "[CDATA[" {
        // "[CDATA[" is matched case-sensitively, as the spec requires. A lower-case
        // "[cdata[" must fall through to the plain bogus-comment branch below so its
        // original spelling is preserved in the comment data.
        skip(7)
        // FIXME: If there is an adjusted current node and it is not an element in the HTML namespace,
        //        then switch to the CDATA section state.
        // FIXME: log_parse_error()
        // Treated as a bogus comment whose data starts with the skipped "[CDATA[".
        self.currentBuilder = "[CDATA["
        self.currentToken = HTMLToken(type: .Comment(data: ""))
        return switchTo(.BogusComment)
    } else {
        // FIXME: log_parse_error()
        // Nothing matched: start an empty comment and let BogusComment consume the input.
        self.currentToken = HTMLToken(type: .Comment(data: ""))
        return switchTo(.BogusComment)
    }
|
||
// 13.2.5.43 Comment start state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
|
||
case .CommentStart:
|
||
switch currentInputCharacter {
|
||
case "-":
|
||
return switchTo(.CommentStartDash)
|
||
case ">":
|
||
// FIXME: log_parse_error()
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
default:
|
||
return reconsume(currentInputCharacter, in: .Comment)
|
||
}
|
||
// 13.2.5.44 Comment start dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-start-dash-state
|
||
case .CommentStartDash:
|
||
switch currentInputCharacter {
|
||
case "-":
|
||
return switchTo(.CommentEnd)
|
||
case ">":
|
||
// FIXME: log_parse_error()
|
||
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
currentBuilder.append("-")
|
||
return reconsume(currentInputCharacter, in: .Comment)
|
||
}
|
||
// 13.2.5.45 Comment state, https://html.spec.whatwg.org/multipage/parsing.html#comment-state
|
||
case .Comment:
|
||
switch currentInputCharacter {
|
||
case "<":
|
||
currentBuilder.append("<")
|
||
return switchTo(.CommentLessThanSign)
|
||
case "-":
|
||
return switchTo(.CommentEndDash)
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
currentBuilder.append("\u{FFFD}")
|
||
return continueInCurrentState()
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
currentBuilder.append(currentInputCharacter!)
|
||
return continueInCurrentState()
|
||
}
|
||
// 13.2.5.46 Comment less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-state
|
||
case .CommentLessThanSign:
|
||
switch currentInputCharacter {
|
||
case "!":
|
||
currentBuilder.append(currentInputCharacter!)
|
||
return switchTo(.CommentLessThanSignBang)
|
||
case "<":
|
||
currentBuilder.append(currentInputCharacter!)
|
||
return continueInCurrentState()
|
||
default:
|
||
return reconsume(currentInputCharacter, in: .Comment)
|
||
}
|
||
// 13.2.5.47 Comment less-than sign bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-state
|
||
case .CommentLessThanSignBang:
|
||
switch currentInputCharacter {
|
||
case "-":
|
||
return switchTo(.CommentLessThanSignBangDash)
|
||
default:
|
||
return reconsume(currentInputCharacter, in: .Comment)
|
||
}
|
||
// 13.2.5.48 Comment less-than sign bang dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-state
|
||
case .CommentLessThanSignBangDash:
|
||
switch currentInputCharacter {
|
||
case "-":
|
||
return switchTo(.CommentLessThanSignBangDashDash)
|
||
default:
|
||
return reconsume(currentInputCharacter, in: .CommentEndDash)
|
||
}
|
||
// 13.2.5.49 Comment less-than sign bang dash dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-less-than-sign-bang-dash-dash-state
|
||
case .CommentLessThanSignBangDashDash:
|
||
switch currentInputCharacter {
|
||
case ">", nil:
|
||
return reconsume(currentInputCharacter, in: .CommentEnd)
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
return reconsume(currentInputCharacter, in: .CommentEnd)
|
||
}
|
||
// 13.2.5.50 Comment end dash state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
|
||
case .CommentEndDash:
|
||
switch currentInputCharacter {
|
||
case "-":
|
||
return switchTo(.CommentEnd)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
currentBuilder.append("-")
|
||
return reconsume(currentInputCharacter, in: .Comment)
|
||
}
|
||
// 13.2.5.51 Comment end state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
|
||
case .CommentEnd:
|
||
switch currentInputCharacter {
|
||
case ">":
|
||
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case "!":
|
||
return switchTo(.CommentEndBang)
|
||
case "-":
|
||
currentBuilder.append("-")
|
||
return continueInCurrentState()
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
currentBuilder.append("--")
|
||
return reconsume(currentInputCharacter, in: .Comment)
|
||
}
|
||
// 13.2.5.52 Comment end bang state, https://html.spec.whatwg.org/multipage/parsing.html#comment-end-bang-state
|
||
case .CommentEndBang:
|
||
switch currentInputCharacter {
|
||
case "-":
|
||
currentBuilder.append("--!")
|
||
return switchTo(.CommentEndDash)
|
||
case ">":
|
||
// FIXME: log_parse_error()
|
||
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
currentToken = HTMLToken(type: .Comment(data: currentBuilder.takeString()))
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
currentBuilder.append("--!")
|
||
return reconsume(currentInputCharacter, in: .Comment)
|
||
}
|
||
// 13.2.5.53 DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
|
||
case .DOCTYPE:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return switchTo(.BeforeDOCTYPEName)
|
||
case ">":
|
||
return reconsume(currentInputCharacter, in: .BeforeDOCTYPEName)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: true))
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
return reconsume(currentInputCharacter!, in: .BeforeDOCTYPEName)
|
||
}
|
||
// 13.2.5.54 Before DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
|
||
case .BeforeDOCTYPEName:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return continueInCurrentState()
|
||
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
|
||
precondition(self.currentBuilder.isEmpty)
|
||
self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: false))
|
||
self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
|
||
return switchTo(.DOCTYPEName)
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
precondition(self.currentBuilder.isEmpty)
|
||
self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: false))
|
||
self.currentBuilder.append("\u{FFFD}")
|
||
return switchTo(.DOCTYPEName)
|
||
case ">":
|
||
// FIXME: log_parse_error()
|
||
self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: true))
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: true))
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
precondition(self.currentBuilder.isEmpty)
|
||
self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: false))
|
||
self.currentBuilder.append(currentInputCharacter!)
|
||
return switchTo(.DOCTYPEName)
|
||
}
|
||
// 13.2.5.55 DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
|
||
case .DOCTYPEName:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
self.currentToken.name = self.currentBuilder.takeString()
|
||
return switchTo(.AfterDOCTYPEName)
|
||
case ">":
|
||
self.currentToken.name = self.currentBuilder.takeString()
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!):
|
||
self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20)))
|
||
return continueInCurrentState()
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
self.currentBuilder.append("\u{FFFD}")
|
||
return continueInCurrentState()
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
self.currentToken.name = self.currentBuilder.takeString()
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
self.currentBuilder.append(currentInputCharacter!)
|
||
return continueInCurrentState()
|
||
}
|
||
// 13.2.5.56 After DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state
|
||
case .AfterDOCTYPEName:
|
||
precondition(self.currentBuilder.isEmpty)
|
||
precondition(self.currentToken.name != nil)
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return continueInCurrentState()
|
||
case ">":
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
if "pP".contains(currentInputCharacter!), peekNext(count: 5)?.uppercased() == "UBLIC" {
|
||
skip(5)
|
||
return switchTo(.AfterDOCTYPEPublicKeyword)
|
||
}
|
||
if "sS".contains(currentInputCharacter!), peekNext(count: 5)?.uppercased() == "YSTEM" {
|
||
skip(5)
|
||
return switchTo(.AfterDOCTYPESystemKeyword)
|
||
}
|
||
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
|
||
}
|
||
// 13.2.5.57 After DOCTYPE public keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state
|
||
case .AfterDOCTYPEPublicKeyword:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return switchTo(.BeforeDOCTYPEPublicIdentifier)
|
||
case "\"":
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.publicIdentifier = ""
|
||
return switchTo(.DOCTYPEPublicIdentifierDoubleQuoted)
|
||
case "'":
|
||
self.currentToken.publicIdentifier = ""
|
||
return switchTo(.DOCTYPEPublicIdentifierSingleQuoted)
|
||
case ">":
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
|
||
}
|
||
// 13.2.5.58 Before DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state
|
||
case .BeforeDOCTYPEPublicIdentifier:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return continueInCurrentState()
|
||
case "\"":
|
||
self.currentToken.publicIdentifier = ""
|
||
return switchTo(.DOCTYPEPublicIdentifierDoubleQuoted)
|
||
case "'":
|
||
self.currentToken.publicIdentifier = ""
|
||
return switchTo(.DOCTYPEPublicIdentifierSingleQuoted)
|
||
case ">":
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
|
||
}
|
||
// 13.2.5.59 DOCTYPE public identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-double-quoted-state
// The identifier's characters are accumulated in currentBuilder and copied into the
// current DOCTYPE token on every path that leaves this state.
case .DOCTYPEPublicIdentifierDoubleQuoted:
    switch currentInputCharacter {
    case "\"":
        // Closing quote: the identifier is complete.
        self.currentToken.publicIdentifier = self.currentBuilder.takeString()
        return switchTo(.AfterDOCTYPEPublicIdentifier)
    case "\0":
        // FIXME: log_parse_error()
        self.currentBuilder.append("\u{FFFD}")
        return continueInCurrentState()
    case ">":
        // FIXME: log_parse_error()
        self.currentToken.publicIdentifier = self.currentBuilder.takeString()
        self.currentToken.forceQuirks = true
        return switchToAndEmitCurrentToken(.Data)
    case nil:
        // FIXME: log_parse_error()
        // Commit the characters read so far; otherwise the emitted token would carry
        // an empty identifier and the partial text would be lost.
        self.currentToken.publicIdentifier = self.currentBuilder.takeString()
        self.currentToken.forceQuirks = true
        return emitCurrentTokenFollowedByEOF()
    default:
        self.currentBuilder.append(currentInputCharacter!)
        return continueInCurrentState()
    }
|
||
// 13.2.5.60 DOCTYPE public identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-single-quoted-state
// The identifier's characters are accumulated in currentBuilder and copied into the
// current DOCTYPE token on every path that leaves this state.
case .DOCTYPEPublicIdentifierSingleQuoted:
    switch currentInputCharacter {
    case "'":
        // Closing quote: the identifier is complete.
        self.currentToken.publicIdentifier = self.currentBuilder.takeString()
        return switchTo(.AfterDOCTYPEPublicIdentifier)
    case "\0":
        // FIXME: log_parse_error()
        self.currentBuilder.append("\u{FFFD}")
        return continueInCurrentState()
    case ">":
        // FIXME: log_parse_error()
        self.currentToken.publicIdentifier = self.currentBuilder.takeString()
        self.currentToken.forceQuirks = true
        return switchToAndEmitCurrentToken(.Data)
    case nil:
        // FIXME: log_parse_error()
        // Commit the characters read so far; otherwise the emitted token would carry
        // an empty identifier and the partial text would be lost.
        self.currentToken.publicIdentifier = self.currentBuilder.takeString()
        self.currentToken.forceQuirks = true
        return emitCurrentTokenFollowedByEOF()
    default:
        self.currentBuilder.append(currentInputCharacter!)
        return continueInCurrentState()
    }
|
||
// 13.2.5.61 After DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state
|
||
case .AfterDOCTYPEPublicIdentifier:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return switchTo(.BetweenDOCTYPEPublicAndSystemIdentifiers)
|
||
case ">":
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case "\"":
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.systemIdentifier = ""
|
||
return switchTo(.DOCTYPESystemIdentifierDoubleQuoted)
|
||
case "'":
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.systemIdentifier = ""
|
||
return switchTo(.DOCTYPESystemIdentifierSingleQuoted)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
|
||
}
|
||
// 13.2.5.62 Between DOCTYPE public and system identifiers state, https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state
|
||
case .BetweenDOCTYPEPublicAndSystemIdentifiers:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return continueInCurrentState()
|
||
case ">":
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case "\"":
|
||
self.currentToken.systemIdentifier = ""
|
||
return switchTo(.DOCTYPESystemIdentifierDoubleQuoted)
|
||
case "'":
|
||
self.currentToken.systemIdentifier = ""
|
||
return switchTo(.DOCTYPESystemIdentifierSingleQuoted)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
|
||
}
|
||
// 13.2.5.63 After DOCTYPE system keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state
|
||
case .AfterDOCTYPESystemKeyword:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return switchTo(.BeforeDOCTYPESystemIdentifier)
|
||
case "\"":
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.systemIdentifier = ""
|
||
return switchTo(.DOCTYPESystemIdentifierDoubleQuoted)
|
||
case "'":
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.systemIdentifier = ""
|
||
return switchTo(.DOCTYPESystemIdentifierSingleQuoted)
|
||
case ">":
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
|
||
}
|
||
// 13.2.5.64 Before DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state
|
||
case .BeforeDOCTYPESystemIdentifier:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return continueInCurrentState()
|
||
case "\"":
|
||
self.currentToken.systemIdentifier = ""
|
||
return switchTo(.DOCTYPESystemIdentifierDoubleQuoted)
|
||
case "'":
|
||
self.currentToken.systemIdentifier = ""
|
||
return switchTo(.DOCTYPESystemIdentifierSingleQuoted)
|
||
case ">":
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
|
||
}
|
||
// 13.2.5.65 DOCTYPE system identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-double-quoted-state
// The identifier's characters are accumulated in currentBuilder and copied into the
// current DOCTYPE token on every path that leaves this state.
case .DOCTYPESystemIdentifierDoubleQuoted:
    switch currentInputCharacter {
    case "\"":
        // Closing quote: the identifier is complete.
        self.currentToken.systemIdentifier = self.currentBuilder.takeString()
        return switchTo(.AfterDOCTYPESystemIdentifier)
    case "\0":
        // FIXME: log_parse_error()
        self.currentBuilder.append("\u{FFFD}")
        return continueInCurrentState()
    case ">":
        // FIXME: log_parse_error()
        self.currentToken.systemIdentifier = self.currentBuilder.takeString()
        self.currentToken.forceQuirks = true
        return switchToAndEmitCurrentToken(.Data)
    case nil:
        // FIXME: log_parse_error()
        // Commit the characters read so far; otherwise the emitted token would carry
        // an empty identifier and the partial text would be lost.
        self.currentToken.systemIdentifier = self.currentBuilder.takeString()
        self.currentToken.forceQuirks = true
        return emitCurrentTokenFollowedByEOF()
    default:
        self.currentBuilder.append(currentInputCharacter!)
        return continueInCurrentState()
    }
|
||
// 13.2.5.66 DOCTYPE system identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-single-quoted-state
// The identifier's characters are accumulated in currentBuilder and copied into the
// current DOCTYPE token on every path that leaves this state.
case .DOCTYPESystemIdentifierSingleQuoted:
    switch currentInputCharacter {
    case "'":
        // Closing quote: the identifier is complete. Without this assignment the
        // token would keep the empty string set when the opening quote was seen,
        // and the builder would be left holding the identifier text.
        self.currentToken.systemIdentifier = self.currentBuilder.takeString()
        return switchTo(.AfterDOCTYPESystemIdentifier)
    case "\0":
        // FIXME: log_parse_error()
        self.currentBuilder.append("\u{FFFD}")
        return continueInCurrentState()
    case ">":
        // FIXME: log_parse_error()
        self.currentToken.systemIdentifier = self.currentBuilder.takeString()
        self.currentToken.forceQuirks = true
        return switchToAndEmitCurrentToken(.Data)
    case nil:
        // FIXME: log_parse_error()
        // Commit the characters read so far; otherwise the emitted token would carry
        // an empty identifier and the partial text would be lost.
        self.currentToken.systemIdentifier = self.currentBuilder.takeString()
        self.currentToken.forceQuirks = true
        return emitCurrentTokenFollowedByEOF()
    default:
        self.currentBuilder.append(currentInputCharacter!)
        return continueInCurrentState()
    }
|
||
// 13.2.5.67 After DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state
|
||
case .AfterDOCTYPESystemIdentifier:
|
||
switch currentInputCharacter {
|
||
case "\t", "\n", "\u{000C}", " ":
|
||
return continueInCurrentState()
|
||
case ">":
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
self.currentToken.forceQuirks = true
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
// NOTE: This does not set the current DOCTYPE token's force-quirks flag to on.
|
||
return reconsume(currentInputCharacter!, in: .BogusDOCTYPE)
|
||
}
|
||
// 13.2.5.68 Bogus DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state
|
||
case .BogusDOCTYPE:
|
||
switch currentInputCharacter {
|
||
case ">":
|
||
return switchToAndEmitCurrentToken(.Data)
|
||
case "\0":
|
||
// FIXME: log_parse_error()
|
||
return continueInCurrentState()
|
||
case nil:
|
||
return emitCurrentTokenFollowedByEOF()
|
||
default:
|
||
return continueInCurrentState()
|
||
}
|
||
// 13.2.5.69 CDATA section state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state
|
||
case .CDATASection:
|
||
switch currentInputCharacter {
|
||
case "]":
|
||
return switchTo(.CDATASectionBracket)
|
||
case nil:
|
||
// FIXME: log_parse_error()
|
||
return emitEOF()
|
||
default:
|
||
// NOTE: U+0000 NULL characters are handled in the tree construction stage,
|
||
// as part of the in foreign content insertion mode, which is the only place where CDATA sections can appear.
|
||
return emitCharacter(currentInputCharacter!)
|
||
}
|
||
// 13.2.5.70 CDATA section bracket state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state
|
||
case .CDATASectionBracket:
|
||
switch currentInputCharacter {
|
||
case "]":
|
||
return switchTo(.CDATASectionEnd)
|
||
default:
|
||
return emitCharacterAndReconsume("]", in: .CDATASection, currentInputCharacter: currentInputCharacter)
|
||
}
|
||
|
||
// 13.2.5.71 CDATA section end state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state
|
||
case .CDATASectionEnd:
|
||
switch currentInputCharacter {
|
||
case "]":
|
||
return emitCharacter("]")
|
||
case ">":
|
||
return switchTo(.Data)
|
||
default:
|
||
queuedTokens.append(HTMLToken(type: .Character(codePoint: "]")))
|
||
return emitCharacterAndReconsume("]", in: .CDATASection, currentInputCharacter: currentInputCharacter)
|
||
}
|
||
// 13.2.5.72 Character reference state, https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
|
||
case .CharacterReference:
|
||
self.temporaryBuffer = "&"
|
||
switch currentInputCharacter {
|
||
case let c? where HTMLTokenizer.asciiAlphanumeric.contains(c.unicodeScalars.first!):
|
||
return reconsume(currentInputCharacter!, in: .NamedCharacterReference)
|
||
case "#":
|
||
self.temporaryBuffer.append(currentInputCharacter!)
|
||
return switchTo(.NumericCharacterReference)
|
||
default:
|
||
self.flushCodepointsConsumedAsACharacterReference()
|
||
return reconsume(currentInputCharacter, in: self.returnState)
|
||
}
|
||
// 13.2.5.73 Named character reference state, https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
// Tries to match a named character reference against the input starting at
// previousCursor. On a match the entity's code points replace the temporary buffer
// and are flushed; otherwise the buffer is flushed as-is and the current character
// is reconsumed in the ambiguous ampersand state.
case .NamedCharacterReference:
    // FIXME: This should be updated to match the C++ implementation, this
    //        won't handle one-character-at-a-time document.write calls properly.
    // `var` because withUTF8 is a mutating call.
    var subString = self.input[self.previousCursor...]
    let entityMatch = subString.withUTF8 { utf8 in
        return Web.HTML.match_entity_for_named_character_reference(AK.StringView(utf8.baseAddress!, utf8.count))
    }
    if entityMatch.hasValue {
        let entity = entityMatch.value!.entity
        skip(entity.length())
        // FIXME: Iterate over the entity's code points and add them instead of creating a string
        self.temporaryBuffer.append(Swift.String(akStringView: entity)!)

        // Inside an attribute, a match without a trailing ';' that is followed by
        // '=' or an ASCII alphanumeric is not treated as a reference: the raw text
        // collected so far is flushed unchanged.
        if self.consumedAsPartOfAnAttribute(), !entity.endsWith(";") {
            if let peeked = peekCodePoint(), peeked == "=" || HTMLTokenizer.asciiAlphanumeric.contains(peeked.unicodeScalars.first!) {
                self.flushCodepointsConsumedAsACharacterReference()
                return switchTo(self.returnState)
            }
        }

        if !entity.endsWith(";") {
            // FIXME: log_parse_error()
        }

        // Replace the raw reference text with the code points it stands for.
        self.temporaryBuffer = ""

        // FIXME: This AK::Vector<u32, 2> should be CxxConvertibleToContainer, but https://github.com/swiftlang/swift/issues/77607
        let codePoints = entityMatch.value!.code_points
        // Half-open range: valid indices are 0 ..< size(). A closed range would
        // read one element past the end of the vector.
        for i in 0..<codePoints.size() {
            self.temporaryBuffer.append(Character(Unicode.Scalar(codePoints[i])!))
        }
        self.flushCodepointsConsumedAsACharacterReference()
        return switchTo(self.returnState)
    }

    // No entity matched.
    self.flushCodepointsConsumedAsACharacterReference()
    return reconsume(currentInputCharacter, in: .AmbiguousAmpersand)
|
||
// 13.2.5.74 Ambiguous ampersand state, https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
|
||
case .AmbiguousAmpersand:
|
||
switch currentInputCharacter {
|
||
case let c? where HTMLTokenizer.asciiAlphanumeric.contains(c.unicodeScalars.first!):
|
||
if self.consumedAsPartOfAnAttribute() {
|
||
self.currentBuilder.append(currentInputCharacter!)
|
||
return continueInCurrentState()
|
||
}
|
||
return emitCharacter(currentInputCharacter!)
|
||
case ";":
|
||
// FIXME: log_parse_error()
|
||
return reconsume(currentInputCharacter!, in: self.returnState)
|
||
default:
|
||
return reconsume(currentInputCharacter, in: self.returnState)
|
||
}
|
||
// 13.2.5.75 Numeric character reference state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state
|
||
case .NumericCharacterReference:
|
||
self.characterReferenceCode = 0
|
||
switch currentInputCharacter {
|
||
case "x", "X":
|
||
self.temporaryBuffer.append(currentInputCharacter!)
|
||
return switchTo(.HexadecimalCharacterReferenceStart)
|
||
default:
|
||
return reconsume(currentInputCharacter, in: .DecimalCharacterReferenceStart)
|
||
}
|
||
// 13.2.5.76 Hexadecimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state
|
||
case .HexadecimalCharacterReferenceStart:
|
||
switch currentInputCharacter {
|
||
case let c? where HTMLTokenizer.asciiHexDigit.contains(c.unicodeScalars.first!):
|
||
return reconsume(currentInputCharacter!, in: .HexadecimalCharacterReference)
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
self.flushCodepointsConsumedAsACharacterReference()
|
||
return reconsume(currentInputCharacter, in: self.returnState)
|
||
}
|
||
// 13.2.5.77 Decimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state
|
||
case .DecimalCharacterReferenceStart:
|
||
switch currentInputCharacter {
|
||
case let c? where HTMLTokenizer.asciiDigit.contains(c.unicodeScalars.first!):
|
||
return reconsume(currentInputCharacter!, in: .DecimalCharacterReference)
|
||
default:
|
||
// FIXME: log_parse_error()
|
||
self.flushCodepointsConsumedAsACharacterReference()
|
||
return reconsume(currentInputCharacter, in: self.returnState)
|
||
}
|
||
// 13.2.5.78 Hexadecimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state
// Accumulates hexadecimal digits into characterReferenceCode.
//
// The accumulator is clamped to 0x110000 (one past the largest code point). The
// numeric character reference end state maps every value above 0x10FFFF to the
// same replacement, so clamping does not change the result, but it keeps an
// arbitrarily long run of digits from overflowing the integer arithmetic (which
// traps in Swift).
case .HexadecimalCharacterReference:
    switch currentInputCharacter {
    case let c? where HTMLTokenizer.asciiDigit.contains(c.unicodeScalars.first!):
        // '0'...'9' -> 0...9
        self.characterReferenceCode = min(self.characterReferenceCode * 16 + Int(c.asciiValue! - 0x30), 0x110000)
        return continueInCurrentState()
    case let c? where HTMLTokenizer.asciiUpperHexDigit.contains(c.unicodeScalars.first!):
        // 'A'...'F' -> 10...15
        self.characterReferenceCode = min(self.characterReferenceCode * 16 + Int(c.asciiValue! - 0x37), 0x110000)
        return continueInCurrentState()
    case let c? where HTMLTokenizer.asciiLowerHexDigit.contains(c.unicodeScalars.first!):
        // 'a'...'f' -> 10...15
        self.characterReferenceCode = min(self.characterReferenceCode * 16 + Int(c.asciiValue! - 0x57), 0x110000)
        return continueInCurrentState()
    case ";":
        return switchTo(.NumericCharacterReferenceEnd)
    default:
        // FIXME: log_parse_error()
        return reconsume(currentInputCharacter, in: .NumericCharacterReferenceEnd)
    }
|
||
|
||
// 13.2.5.79 Decimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
// Accumulates decimal digits into characterReferenceCode.
//
// The accumulator is clamped to 0x110000 (one past the largest code point). The
// numeric character reference end state maps every value above 0x10FFFF to the
// same replacement, so clamping does not change the result, but it keeps an
// arbitrarily long run of digits from overflowing the integer arithmetic (which
// traps in Swift).
case .DecimalCharacterReference:
    switch currentInputCharacter {
    case let c? where HTMLTokenizer.asciiDigit.contains(c.unicodeScalars.first!):
        // '0'...'9' -> 0...9
        self.characterReferenceCode = min(self.characterReferenceCode * 10 + Int(c.asciiValue! - 0x30), 0x110000)
        return continueInCurrentState()
    case ";":
        return switchTo(.NumericCharacterReferenceEnd)
    default:
        // FIXME: log_parse_error()
        return reconsume(currentInputCharacter, in: .NumericCharacterReferenceEnd)
    }
|
||
|
||
// 13.2.5.80 Numeric character reference end state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
// Converts the accumulated characterReferenceCode into the scalar to emit, then
// flushes it and returns to the state the reference was started from.
case .NumericCharacterReferenceEnd:
    // This state only inspects the accumulated code; it does not consume input.
    dontConsumeNextInputCharacter()
    let codePoint: UnicodeScalar =
        switch self.characterReferenceCode {
        case 0x00:
            // FIXME: log_parse_error()
            UnicodeScalar(0xFFFD)!
        case let c where c > 0x10FFFF:
            // FIXME: log_parse_error()
            UnicodeScalar(0xFFFD)!
        case 0xD800...0xDFFF:
            // Surrogates only. This must not match other 16-bit values, otherwise
            // every BMP reference (e.g. `&#65;`) would be replaced with U+FFFD.
            // FIXME: log_parse_error()
            UnicodeScalar(0xFFFD)!
        case let c where UnicodeScalar(c)! == "\u{000D}" || HTMLTokenizer.controlNotAsciiWhitespace.contains(UnicodeScalar(c)!):
            // FIXME: log_parse_error()
            // Use the remapped scalar when the table has an entry for this code,
            // otherwise keep the code as written.
            if let codePoint = HTMLTokenizer.characterReferenceControlCodeMapping[c] {
                codePoint
            } else {
                UnicodeScalar(c)!
            }
        case let c where HTMLTokenizer.nonCharacter.contains(UnicodeScalar(c)!):
            // FIXME: log_parse_error()
            UnicodeScalar(c)!
        default:
            UnicodeScalar(self.characterReferenceCode)!
        }

    self.temporaryBuffer = Swift.String(Character(codePoint))
    self.flushCodepointsConsumedAsACharacterReference()
    return switchTo(self.returnState)
|
||
}
|
||
}
|
||
}
|