diff --git a/Libraries/LibWeb/HTML/Parser/HTMLToken.swift b/Libraries/LibWeb/HTML/Parser/HTMLToken.swift
index bdecd8b8eb5..ec7501cfea1 100644
--- a/Libraries/LibWeb/HTML/Parser/HTMLToken.swift
+++ b/Libraries/LibWeb/HTML/Parser/HTMLToken.swift
@@ -163,6 +163,89 @@ public class HTMLToken {
             }
         }
     }
+    /// The `name` payload of a `.DOCTYPE` token; the setter rebuilds the case and keeps the other three payload fields.
+    /// Both accessors trap with `preconditionFailure` when the token is not a `.DOCTYPE`.
+    public var name: Swift.String? {
+        get {
+            switch self.type {
+            case .DOCTYPE(let name, _, _, _):
+                return name
+            default:
+                preconditionFailure("name called on non-doctype token")
+            }
+        }
+        set {
+            switch self.type {
+            case .DOCTYPE(_, let publicIdentifier, let systemIdentifier, let forceQuirksMode):
+                self.type = .DOCTYPE(name: newValue, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier, forceQuirksMode: forceQuirksMode)
+            default:
+                preconditionFailure("name= called on non-doctype token")
+            }
+        }
+    }
+
+    /// The `forceQuirksMode` payload of a `.DOCTYPE` token; the setter rebuilds the case and keeps the other three payload fields.
+    /// Both accessors trap with `preconditionFailure` when the token is not a `.DOCTYPE`.
+    public var forceQuirks: Bool {
+        get {
+            switch self.type {
+            case .DOCTYPE(_, _, _, let forceQuirksMode):
+                return forceQuirksMode
+            default:
+                preconditionFailure("forceQuirks called on non-doctype token")
+            }
+        }
+        set {
+            switch self.type {
+            case .DOCTYPE(let name, let publicIdentifier, let systemIdentifier, _):
+                self.type = .DOCTYPE(name: name, publicIdentifier: publicIdentifier, systemIdentifier: systemIdentifier, forceQuirksMode: newValue)
+            default:
+                preconditionFailure("forceQuirks= called on non-doctype token")
+            }
+        }
+    }
+
+    /// The `publicIdentifier` payload of a `.DOCTYPE` token; the setter rebuilds the case and keeps the other three payload fields.
+    /// Both accessors trap with `preconditionFailure` when the token is not a `.DOCTYPE`.
+    public var publicIdentifier: Swift.String? {
+        get {
+            switch self.type {
+            case .DOCTYPE(_, let publicIdentifier, _, _):
+                return publicIdentifier
+            default:
+                preconditionFailure("publicIdentifier called on non-doctype token")
+            }
+        }
+        set {
+            switch self.type {
+            case .DOCTYPE(let name, _, let systemIdentifier, let forceQuirksMode):
+                self.type = .DOCTYPE(name: name, publicIdentifier: newValue, systemIdentifier: systemIdentifier, forceQuirksMode: forceQuirksMode)
+            default:
+                preconditionFailure("publicIdentifier= called on non-doctype token")
+            }
+        }
+    }
+
+    /// The `systemIdentifier` payload of a `.DOCTYPE` token; the setter rebuilds the case and keeps the other three payload fields.
+    /// Both accessors trap with `preconditionFailure` when the token is not a `.DOCTYPE`.
+    public var systemIdentifier: Swift.String? {
+        get {
+            switch self.type {
+            case .DOCTYPE(_, _, let systemIdentifier, _):
+                return systemIdentifier
+            default:
+                preconditionFailure("systemIdentifier called on non-doctype token")
+            }
+        }
+        set {
+            switch self.type {
+            case .DOCTYPE(let name, let publicIdentifier, _, let forceQuirksMode):
+                self.type = .DOCTYPE(name: name, publicIdentifier: publicIdentifier, systemIdentifier: newValue, forceQuirksMode: forceQuirksMode)
+            default:
+                preconditionFailure("systemIdentifier= called on non-doctype token")
+            }
+        }
+    }
 
     public init() {}
     public init(type: TokenType) {
diff --git a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
index 9ef32a532ef..b3ead8c20f2 100644
--- a/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
+++ b/Libraries/LibWeb/HTML/Parser/HTMLTokenizer.swift
@@ -126,10 +126,60 @@ public class HTMLTokenizer {
     private var lastStartTagName: Swift.String? = nil
     private var currentTokensAttributes: [HTMLToken.Attribute]? = nil
     private var currentAttribute: HTMLToken.Attribute?
= nil + private var characterReferenceCode: Int = 0 private var aborted = false private var hasEmittedEOF = false + // https://infra.spec.whatwg.org/#noncharacter + // A noncharacter is a code point that is in the range U+FDD0 to U+FDEF, inclusive, + // or U+FFFE, U+FFFF, U+1FFFE, U+1FFFF, U+2FFFE, U+2FFFF, U+3FFFE, U+3FFFF, U+4FFFE, U+4FFFF, + // U+5FFFE, U+5FFFF, U+6FFFE, U+6FFFF, U+7FFFE, U+7FFFF, U+8FFFE, U+8FFFF, U+9FFFE, U+9FFFF, + // U+AFFFE, U+AFFFF, U+BFFFE, U+BFFFF, U+CFFFE, U+CFFFF, U+DFFFE, U+DFFFF, U+EFFFE, U+EFFFF, + // U+FFFFE, U+FFFFF, U+10FFFE, or U+10FFFF. + static private var nonCharacter = CharacterSet(charactersIn: Unicode.Scalar(0xFDD0)!...Unicode.Scalar(0xFDEF)!) + .union(CharacterSet(charactersIn: "\u{FFFE}"..."\u{FFFF}")) + .union(CharacterSet(charactersIn: "\u{1FFFE}"..."\u{1FFFF}")) + .union(CharacterSet(charactersIn: "\u{2FFFE}"..."\u{2FFFF}")) + .union(CharacterSet(charactersIn: "\u{3FFFE}"..."\u{3FFFF}")) + .union(CharacterSet(charactersIn: "\u{4FFFE}"..."\u{4FFFF}")) + .union(CharacterSet(charactersIn: "\u{5FFFE}"..."\u{5FFFF}")) + .union(CharacterSet(charactersIn: "\u{6FFFE}"..."\u{6FFFF}")) + .union(CharacterSet(charactersIn: "\u{7FFFE}"..."\u{7FFFF}")) + .union(CharacterSet(charactersIn: "\u{8FFFE}"..."\u{8FFFF}")) + .union(CharacterSet(charactersIn: "\u{9FFFE}"..."\u{9FFFF}")) + .union(CharacterSet(charactersIn: "\u{AFFFE}"..."\u{AFFFF}")) + .union(CharacterSet(charactersIn: "\u{BFFFE}"..."\u{BFFFF}")) + .union(CharacterSet(charactersIn: "\u{CFFFE}"..."\u{CFFFF}")) + .union(CharacterSet(charactersIn: "\u{DFFFE}"..."\u{DFFFF}")) + .union(CharacterSet(charactersIn: "\u{EFFFE}"..."\u{EFFFF}")) + .union(CharacterSet(charactersIn: "\u{FFFFE}"..."\u{FFFFF}")) + .union(CharacterSet(charactersIn: "\u{10FFFE}"..."\u{10FFFF}")) + + // https://infra.spec.whatwg.org/#ascii-whitespace + static private var asciiWhitespace = CharacterSet(charactersIn: "\t\n\u{000C}\u{000D} ") + + // https://infra.spec.whatwg.org/#c0-control + static private var 
c0Control = CharacterSet(charactersIn: "\u{0000}"..."\u{001F}") + + // https://infra.spec.whatwg.org/#control + static private var control = c0Control.union(CharacterSet(charactersIn: "\u{007F}"..."\u{009F}")) + + // IMPLEMENTATION DEFINED: Used for the numeric character reference end state + static private var controlNotAsciiWhitespace = control.subtracting(asciiWhitespace) + + // https://infra.spec.whatwg.org/#ascii-digit + static private var asciiDigit = CharacterSet(charactersIn: "0123456789") + + // https://infra.spec.whatwg.org/#ascii-upper-hex-digit + static private var asciiUpperHexDigit = CharacterSet(charactersIn: "ABCDEF") + + // https://infra.spec.whatwg.org/#ascii-lower-hex-digit + static private var asciiLowerHexDigit = CharacterSet(charactersIn: "abcdef") + + // https://infra.spec.whatwg.org/#ascii-hex-digit + static private var asciiHexDigit = asciiUpperHexDigit.union(asciiLowerHexDigit) + // https://infra.spec.whatwg.org/#ascii-upper-alpha static private var asciiUpperAlpha = CharacterSet(charactersIn: "ABCDEFGHIJKLMNOPQRSTUVWXYZ") @@ -139,6 +189,40 @@ public class HTMLTokenizer { // https://infra.spec.whatwg.org/#ascii-upper-alpha static private var asciiAlpha = asciiUpperAlpha.union(asciiLowerAlpha) + // https://infra.spec.whatwg.org/#ascii-alphanumeric + static private var asciiAlphanumeric = asciiAlpha.union(asciiDigit) + + static private var characterReferenceControlCodeMapping: [Int: Unicode.Scalar] = + [ + 0x80: Unicode.Scalar(0x20AC)!, // € + 0x82: Unicode.Scalar(0x201A)!, // ‚ + 0x83: Unicode.Scalar(0x0192)!, // ƒ + 0x84: Unicode.Scalar(0x201E)!, // „ + 0x85: Unicode.Scalar(0x2026)!, // … + 0x86: Unicode.Scalar(0x2020)!, // † + 0x87: Unicode.Scalar(0x2021)!, // ‡ + 0x88: Unicode.Scalar(0x02C6)!, // ˆ + 0x89: Unicode.Scalar(0x2030)!, // ‰ + 0x8A: Unicode.Scalar(0x0160)!, // Š + 0x8B: Unicode.Scalar(0x2039)!, // ‹ + 0x8C: Unicode.Scalar(0x0152)!, // Œ + 0x8E: Unicode.Scalar(0x017D)!, // Ž + 0x91: Unicode.Scalar(0x2018)!, // ‘ + 0x92: 
Unicode.Scalar(0x2019)!, // ’ + 0x93: Unicode.Scalar(0x201C)!, // “ + 0x94: Unicode.Scalar(0x201D)!, // ” + 0x95: Unicode.Scalar(0x2022)!, // • + 0x96: Unicode.Scalar(0x2013)!, // – + 0x97: Unicode.Scalar(0x2014)!, // — + 0x98: Unicode.Scalar(0x02DC)!, // ˜ + 0x99: Unicode.Scalar(0x2122)!, // ™ + 0x9A: Unicode.Scalar(0x0161)!, // š + 0x9B: Unicode.Scalar(0x203A)!, // › + 0x9C: Unicode.Scalar(0x0153)!, // œ + 0x9E: Unicode.Scalar(0x017E)!, // ž + 0x9F: Unicode.Scalar(0x0178)!, // Ÿ + ] + public init() { self.cursor = self.input.startIndex self.previousCursor = self.input.startIndex @@ -162,8 +246,8 @@ public class HTMLTokenizer { } func skip(_ count: Int) { + self.previousCursor = self.cursor self.cursor = self.input.index(self.cursor, offsetBy: count, limitedBy: self.input.endIndex) ?? input.endIndex - self.previousCursor = self.input.index(before: self.cursor) } func peekCodePoint(_ offset: Int = 0) -> Character? { @@ -579,6 +663,65 @@ public class HTMLTokenizer { self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint))) } return reconsume(currentInputCharacter, in: .RCDATA) + // 13.2.5.12 RAWTEXT less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-less-than-sign-state + case .RAWTEXTLessThanSign: + switch currentInputCharacter { + case "/": + self.temporaryBuffer = "" + return switchTo(.RAWTEXTEndTagOpen) + default: + return emitCharacterAndReconsume("<", in: .RAWTEXT, currentInputCharacter: currentInputCharacter) + } + // 13.2.5.13 RAWTEXT end tag open state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-open-state + case .RAWTEXTEndTagOpen: + switch currentInputCharacter { + case let c? 
where HTMLTokenizer.asciiAlpha.contains(c.unicodeScalars.first!): + createNewToken(HTMLToken(type: .EndTag(tagName: ""))) + return reconsume(currentInputCharacter!, in: .RAWTEXTEndTagName) + default: + queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + queuedTokens.append(HTMLToken(type: .Character(codePoint: "/"))) + return reconsume(currentInputCharacter, in: .RAWTEXT) + } + // 13.2.5.14 RAWTEXT end tag name state, https://html.spec.whatwg.org/multipage/parsing.html#rawtext-end-tag-name-state + case .RAWTEXTEndTagName: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + if self.isAppropriateEndTagToken(currentToken) { + return switchTo(.BeforeAttributeName) + } + break + case "/": + if self.isAppropriateEndTagToken(currentToken) { + return switchTo(.SelfClosingStartTag) + } + break + case ">": + if self.isAppropriateEndTagToken(currentToken) { + return switchToAndEmitCurrentToken(.Data) + } + break + case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!): + self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! + 0x20))) + self.temporaryBuffer.append(c) + return continueInCurrentState() + case let c? where HTMLTokenizer.asciiLowerAlpha.contains(c.unicodeScalars.first!): + self.currentBuilder.append(c) + self.temporaryBuffer.append(c) + return continueInCurrentState() + default: + break + } + + // First three steps fall through to the "anything else" block + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "<"))) + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: "/"))) + // NOTE: The spec doesn't mention this, but it seems that m_current_token (an end tag) is just dropped in this case. 
+ self.currentBuilder = "" + for codePoint in self.temporaryBuffer { + self.queuedTokens.append(HTMLToken(type: .Character(codePoint: codePoint))) + } + return reconsume(currentInputCharacter, in: .RAWTEXT) // 13.2.5.15 Script data less-than sign state, https://html.spec.whatwg.org/multipage/parsing.html#script-data-less-than-sign-state case .ScriptDataLessThanSign: switch currentInputCharacter { @@ -1225,9 +1368,559 @@ public class HTMLTokenizer { currentBuilder.append("--!") return reconsume(currentInputCharacter, in: .Comment) } - default: - print("TODO: In state \(self.state) with input \(Swift.String(describing: currentInputCharacter))") - return emitEOF() + // 13.2.5.53 DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-state + case .DOCTYPE: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return switchTo(.BeforeDOCTYPEName) + case ">": + return reconsume(currentInputCharacter, in: .BeforeDOCTYPEName) + case nil: + // FIXME: log_parse_error() + currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: true)) + return emitCurrentTokenFollowedByEOF() + default: + // FIXME: log_parse_error() + return reconsume(currentInputCharacter!, in: .BeforeDOCTYPEName) + } + // 13.2.5.54 Before DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state + case .BeforeDOCTYPEName: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return continueInCurrentState() + case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!): + precondition(self.currentBuilder.isEmpty) + self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: false)) + self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! 
+ 0x20))) + return switchTo(.DOCTYPEName) + case "\0": + // FIXME: log_parse_error() + precondition(self.currentBuilder.isEmpty) + self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: false)) + self.currentBuilder.append("\u{FFFD}") + return switchTo(.DOCTYPEName) + case ">": + // FIXME: log_parse_error() + self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: true)) + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: true)) + return emitCurrentTokenFollowedByEOF() + default: + // FIXME: log_parse_error() + precondition(self.currentBuilder.isEmpty) + self.currentToken = HTMLToken(type: .DOCTYPE(name: nil, publicIdentifier: nil, systemIdentifier: nil, forceQuirksMode: false)) + self.currentBuilder.append(currentInputCharacter!) + return switchTo(.DOCTYPEName) + } + // 13.2.5.55 DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state + case .DOCTYPEName: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + self.currentToken.name = self.currentBuilder.takeString() + return switchTo(.AfterDOCTYPEName) + case ">": + self.currentToken.name = self.currentBuilder.takeString() + return switchToAndEmitCurrentToken(.Data) + case let c? where HTMLTokenizer.asciiUpperAlpha.contains(c.unicodeScalars.first!): + self.currentBuilder.append(Character(Unicode.Scalar(c.asciiValue! 
+ 0x20))) + return continueInCurrentState() + case "\0": + // FIXME: log_parse_error() + self.currentBuilder.append("\u{FFFD}") + return continueInCurrentState() + case nil: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + self.currentToken.name = self.currentBuilder.takeString() + return emitCurrentTokenFollowedByEOF() + default: + self.currentBuilder.append(currentInputCharacter!) + return continueInCurrentState() + } + // 13.2.5.56 After DOCTYPE name state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-name-state + case .AfterDOCTYPEName: + precondition(self.currentBuilder.isEmpty) + precondition(self.currentToken.name != nil) + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return continueInCurrentState() + case ">": + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return emitCurrentTokenFollowedByEOF() + default: + if "pP".contains(currentInputCharacter!), peekNext(count: 5)?.uppercased() == "UBLIC" { + skip(5) + return switchTo(.AfterDOCTYPEPublicKeyword) + } + if "sS".contains(currentInputCharacter!), peekNext(count: 5)?.uppercased() == "YSTEM" { + skip(5) + return switchTo(.AfterDOCTYPESystemKeyword) + } + + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return reconsume(currentInputCharacter!, in: .BogusDOCTYPE) + } + // 13.2.5.57 After DOCTYPE public keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-keyword-state + case .AfterDOCTYPEPublicKeyword: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return switchTo(.BeforeDOCTYPEPublicIdentifier) + case "\"": + // FIXME: log_parse_error() + self.currentToken.publicIdentifier = "" + return switchTo(.DOCTYPEPublicIdentifierDoubleQuoted) + case "'": + self.currentToken.publicIdentifier = "" + return switchTo(.DOCTYPEPublicIdentifierSingleQuoted) + case ">": + // FIXME: log_parse_error() + 
self.currentToken.forceQuirks = true + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return emitCurrentTokenFollowedByEOF() + default: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return reconsume(currentInputCharacter!, in: .BogusDOCTYPE) + } + // 13.2.5.58 Before DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-public-identifier-state + case .BeforeDOCTYPEPublicIdentifier: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return continueInCurrentState() + case "\"": + self.currentToken.publicIdentifier = "" + return switchTo(.DOCTYPEPublicIdentifierDoubleQuoted) + case "'": + self.currentToken.publicIdentifier = "" + return switchTo(.DOCTYPEPublicIdentifierSingleQuoted) + case ">": + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return emitCurrentTokenFollowedByEOF() + default: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return reconsume(currentInputCharacter!, in: .BogusDOCTYPE) + } + // 13.2.5.59 DOCTYPE public identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-double-quoted-state + case .DOCTYPEPublicIdentifierDoubleQuoted: + switch currentInputCharacter { + case "\"": + self.currentToken.publicIdentifier = self.currentBuilder.takeString() + return switchTo(.AfterDOCTYPEPublicIdentifier) + case "\0": + // FIXME: log_parse_error() + self.currentBuilder.append("\u{FFFD}") + return continueInCurrentState() + case ">": + // FIXME: log_parse_error() + self.currentToken.publicIdentifier = self.currentBuilder.takeString() + self.currentToken.forceQuirks = true + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + 
// EOF inside the double-quoted public identifier: keep the characters consumed so far.
+                // The spec appends each character to the token as it arrives, so the emitted
+                // DOCTYPE must carry the partial identifier (the `>` branch above already does this).
+                self.currentToken.publicIdentifier = self.currentBuilder.takeString()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            }
+        // 13.2.5.60 DOCTYPE public identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-public-identifier-single-quoted-state
+        case .DOCTYPEPublicIdentifierSingleQuoted:
+            switch currentInputCharacter {
+            case "'":
+                self.currentToken.publicIdentifier = self.currentBuilder.takeString()
+                return switchTo(.AfterDOCTYPEPublicIdentifier)
+            case "\0":
+                // FIXME: log_parse_error()
+                self.currentBuilder.append("\u{FFFD}")
+                return continueInCurrentState()
+            case ">":
+                // FIXME: log_parse_error()
+                self.currentToken.publicIdentifier = self.currentBuilder.takeString()
+                self.currentToken.forceQuirks = true
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                // Store the partial identifier before emitting, mirroring the `>` branch.
+                self.currentToken.publicIdentifier = self.currentBuilder.takeString()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+ return continueInCurrentState() + } + // 13.2.5.61 After DOCTYPE public identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-public-identifier-state + case .AfterDOCTYPEPublicIdentifier: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return switchTo(.BetweenDOCTYPEPublicAndSystemIdentifiers) + case ">": + return switchToAndEmitCurrentToken(.Data) + case "\"": + // FIXME: log_parse_error() + self.currentToken.systemIdentifier = "" + return switchTo(.DOCTYPESystemIdentifierDoubleQuoted) + case "'": + // FIXME: log_parse_error() + self.currentToken.systemIdentifier = "" + return switchTo(.DOCTYPESystemIdentifierSingleQuoted) + case nil: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return emitCurrentTokenFollowedByEOF() + default: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return reconsume(currentInputCharacter!, in: .BogusDOCTYPE) + } + // 13.2.5.62 Between DOCTYPE public and system identifiers state, https://html.spec.whatwg.org/multipage/parsing.html#between-doctype-public-and-system-identifiers-state + case .BetweenDOCTYPEPublicAndSystemIdentifiers: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return continueInCurrentState() + case ">": + return switchToAndEmitCurrentToken(.Data) + case "\"": + self.currentToken.systemIdentifier = "" + return switchTo(.DOCTYPESystemIdentifierDoubleQuoted) + case "'": + self.currentToken.systemIdentifier = "" + return switchTo(.DOCTYPESystemIdentifierSingleQuoted) + case nil: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return emitCurrentTokenFollowedByEOF() + default: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return reconsume(currentInputCharacter!, in: .BogusDOCTYPE) + } + // 13.2.5.63 After DOCTYPE system keyword state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-keyword-state + case .AfterDOCTYPESystemKeyword: + 
switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return switchTo(.BeforeDOCTYPESystemIdentifier) + case "\"": + // FIXME: log_parse_error() + self.currentToken.systemIdentifier = "" + return switchTo(.DOCTYPESystemIdentifierDoubleQuoted) + case "'": + // FIXME: log_parse_error() + self.currentToken.systemIdentifier = "" + return switchTo(.DOCTYPESystemIdentifierSingleQuoted) + case ">": + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return emitCurrentTokenFollowedByEOF() + default: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return reconsume(currentInputCharacter!, in: .BogusDOCTYPE) + } + // 13.2.5.64 Before DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-system-identifier-state + case .BeforeDOCTYPESystemIdentifier: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return continueInCurrentState() + case "\"": + self.currentToken.systemIdentifier = "" + return switchTo(.DOCTYPESystemIdentifierDoubleQuoted) + case "'": + self.currentToken.systemIdentifier = "" + return switchTo(.DOCTYPESystemIdentifierSingleQuoted) + case ">": + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return emitCurrentTokenFollowedByEOF() + default: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return reconsume(currentInputCharacter!, in: .BogusDOCTYPE) + } + // 13.2.5.65 DOCTYPE system identifier (double-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-double-quoted-state + case .DOCTYPESystemIdentifierDoubleQuoted: + switch currentInputCharacter { + case "\"": + self.currentToken.systemIdentifier = 
self.currentBuilder.takeString()
+                return switchTo(.AfterDOCTYPESystemIdentifier)
+            case "\0":
+                // FIXME: log_parse_error()
+                self.currentBuilder.append("\u{FFFD}")
+                return continueInCurrentState()
+            case ">":
+                // FIXME: log_parse_error()
+                self.currentToken.systemIdentifier = self.currentBuilder.takeString()
+                self.currentToken.forceQuirks = true
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                // Keep the characters consumed so far: the spec appends each character to the
+                // token as it arrives, so the emitted DOCTYPE must carry the partial identifier.
+                self.currentToken.systemIdentifier = self.currentBuilder.takeString()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+                return continueInCurrentState()
+            }
+        // 13.2.5.66 DOCTYPE system identifier (single-quoted) state, https://html.spec.whatwg.org/multipage/parsing.html#doctype-system-identifier-single-quoted-state
+        case .DOCTYPESystemIdentifierSingleQuoted:
+            switch currentInputCharacter {
+            case "'":
+                // The closing quote ends the identifier: move the buffered characters into the
+                // token (as the double-quoted state does), otherwise it keeps its initial "".
+                self.currentToken.systemIdentifier = self.currentBuilder.takeString()
+                return switchTo(.AfterDOCTYPESystemIdentifier)
+            case "\0":
+                // FIXME: log_parse_error()
+                self.currentBuilder.append("\u{FFFD}")
+                return continueInCurrentState()
+            case ">":
+                // FIXME: log_parse_error()
+                self.currentToken.systemIdentifier = self.currentBuilder.takeString()
+                self.currentToken.forceQuirks = true
+                return switchToAndEmitCurrentToken(.Data)
+            case nil:
+                // FIXME: log_parse_error()
+                // Store the partial identifier before emitting, mirroring the `>` branch.
+                self.currentToken.systemIdentifier = self.currentBuilder.takeString()
+                self.currentToken.forceQuirks = true
+                return emitCurrentTokenFollowedByEOF()
+            default:
+                self.currentBuilder.append(currentInputCharacter!)
+ return continueInCurrentState() + } + // 13.2.5.67 After DOCTYPE system identifier state, https://html.spec.whatwg.org/multipage/parsing.html#after-doctype-system-identifier-state + case .AfterDOCTYPESystemIdentifier: + switch currentInputCharacter { + case "\t", "\n", "\u{000C}", " ": + return continueInCurrentState() + case ">": + return switchToAndEmitCurrentToken(.Data) + case nil: + // FIXME: log_parse_error() + self.currentToken.forceQuirks = true + return emitCurrentTokenFollowedByEOF() + default: + // FIXME: log_parse_error() + // NOTE: This does not set the current DOCTYPE token's force-quirks flag to on. + return reconsume(currentInputCharacter!, in: .BogusDOCTYPE) + } + // 13.2.5.68 Bogus DOCTYPE state, https://html.spec.whatwg.org/multipage/parsing.html#bogus-doctype-state + case .BogusDOCTYPE: + switch currentInputCharacter { + case ">": + return switchToAndEmitCurrentToken(.Data) + case "\0": + // FIXME: log_parse_error() + return continueInCurrentState() + case nil: + return emitCurrentTokenFollowedByEOF() + default: + return continueInCurrentState() + } + // 13.2.5.69 CDATA section state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-state + case .CDATASection: + switch currentInputCharacter { + case "]": + return switchTo(.CDATASectionBracket) + case nil: + // FIXME: log_parse_error() + return emitEOF() + default: + // NOTE: U+0000 NULL characters are handled in the tree construction stage, + // as part of the in foreign content insertion mode, which is the only place where CDATA sections can appear. + return emitCharacter(currentInputCharacter!) 
+ } + // 13.2.5.70 CDATA section bracket state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-bracket-state + case .CDATASectionBracket: + switch currentInputCharacter { + case "]": + return switchTo(.CDATASectionEnd) + default: + return emitCharacterAndReconsume("]", in: .CDATASection, currentInputCharacter: currentInputCharacter) + } + + // 13.2.5.71 CDATA section end state, https://html.spec.whatwg.org/multipage/parsing.html#cdata-section-end-state + case .CDATASectionEnd: + switch currentInputCharacter { + case "]": + return emitCharacter("]") + case ">": + return switchTo(.Data) + default: + queuedTokens.append(HTMLToken(type: .Character(codePoint: "]"))) + return emitCharacterAndReconsume("]", in: .CDATASection, currentInputCharacter: currentInputCharacter) + } + // 13.2.5.72 Character reference state, https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state + case .CharacterReference: + self.temporaryBuffer = "&" + switch currentInputCharacter { + case let c? where HTMLTokenizer.asciiAlphanumeric.contains(c.unicodeScalars.first!): + return reconsume(currentInputCharacter!, in: .NamedCharacterReference) + case "#": + self.temporaryBuffer.append(currentInputCharacter!) + return switchTo(.NumericCharacterReference) + default: + self.flushCodepointsConsumedAsACharacterReference() + return reconsume(currentInputCharacter, in: self.returnState) + } + // 13.2.5.73 Named character reference state, https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state + case .NamedCharacterReference: + var subString = self.input[self.previousCursor...] 
+            // Ask the C++ entity table for the longest named reference at the start of `subString`.
+            let entityMatch = subString.withUTF8 { utf8 in
+                return Web.HTML.match_entity_for_named_character_reference(AK.StringView(utf8.baseAddress!, utf8.count))
+            }
+            if entityMatch.hasValue {
+                let entity = entityMatch.value!.entity
+                // NOTE(review): the match starts at `previousCursor`; confirm that skipping the full
+                // entity length from `cursor` does not overshoot by the already-consumed first character.
+                skip(entity.length())
+                // FIXME: Iterate over the entity's code points and add them instead of creating a string
+                self.temporaryBuffer.append(Swift.String(akStringView: entity)!)
+
+                // Historical quirk: inside an attribute, a match without ';' followed by '=' or an
+                // alphanumeric is left as literal text.
+                if self.consumedAsPartOfAnAttribute(), !entity.endsWith(";") {
+                    if let peeked = peekCodePoint(), peeked == "=" || HTMLTokenizer.asciiAlphanumeric.contains(peeked.unicodeScalars.first!) {
+                        self.flushCodepointsConsumedAsACharacterReference()
+                        return switchTo(self.returnState)
+                    }
+                }
+
+                if !entity.endsWith(";") {
+                    // FIXME: log_parse_error()
+                }
+
+                self.temporaryBuffer = ""
+
+                // FIXME: This AK::Vector should be CxxConvertibleToContainer, but https://github.com/swiftlang/swift/issues/77607
+                let codePoints = entityMatch.value!.code_points
+                // Half-open range: valid indices are 0..<size(); a closed range reads one past the end.
+                for i in 0..<codePoints.size() {
+                    self.temporaryBuffer.append(Character(Unicode.Scalar(codePoints[i])!))
+                }
+                self.flushCodepointsConsumedAsACharacterReference()
+                return switchTo(self.returnState)
+            }
+
+            self.flushCodepointsConsumedAsACharacterReference()
+            return reconsume(currentInputCharacter, in: .AmbiguousAmpersand)
+        // 13.2.5.74 Ambiguous ampersand state, https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
+        case .AmbiguousAmpersand:
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiAlphanumeric.contains(c.unicodeScalars.first!):
+                if self.consumedAsPartOfAnAttribute() {
+                    self.currentBuilder.append(currentInputCharacter!)
+                    return continueInCurrentState()
+                }
+                return emitCharacter(currentInputCharacter!)
+ case ";": + // FIXME: log_parse_error() + return reconsume(currentInputCharacter!, in: self.returnState) + default: + return reconsume(currentInputCharacter, in: self.returnState) + } + // 13.2.5.75 Numeric character reference state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-state + case .NumericCharacterReference: + self.characterReferenceCode = 0 + switch currentInputCharacter { + case "x", "X": + self.temporaryBuffer.append(currentInputCharacter!) + return switchTo(.HexadecimalCharacterReferenceStart) + default: + return reconsume(currentInputCharacter, in: .DecimalCharacterReferenceStart) + } + // 13.2.5.76 Hexadecimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-start-state + case .HexadecimalCharacterReferenceStart: + switch currentInputCharacter { + case let c? where HTMLTokenizer.asciiHexDigit.contains(c.unicodeScalars.first!): + return reconsume(currentInputCharacter!, in: .HexadecimalCharacterReference) + default: + // FIXME: log_parse_error() + self.flushCodepointsConsumedAsACharacterReference() + return reconsume(currentInputCharacter, in: self.returnState) + } + // 13.2.5.77 Decimal character reference start state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-start-state + case .DecimalCharacterReferenceStart: + switch currentInputCharacter { + case let c? where HTMLTokenizer.asciiDigit.contains(c.unicodeScalars.first!): + return reconsume(currentInputCharacter!, in: .DecimalCharacterReference) + default: + // FIXME: log_parse_error() + self.flushCodepointsConsumedAsACharacterReference() + return reconsume(currentInputCharacter, in: self.returnState) + } + // 13.2.5.78 Hexadecimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#hexadecimal-character-reference-state + case .HexadecimalCharacterReference: + switch currentInputCharacter { + case let c? 
where HTMLTokenizer.asciiDigit.contains(c.unicodeScalars.first!):
+                // Saturate once the code exceeds U+10FFFF: the end state maps every such value to
+                // U+FFFD, and accumulating further would overflow `Int` (a trap) on long digit runs.
+                if self.characterReferenceCode <= 0x10FFFF {
+                    self.characterReferenceCode = self.characterReferenceCode * 16 + Int(c.asciiValue! - 0x30)
+                }
+                return continueInCurrentState()
+            case let c? where HTMLTokenizer.asciiUpperHexDigit.contains(c.unicodeScalars.first!):
+                // Same saturation as above; 'A'...'F' map to 10...15.
+                if self.characterReferenceCode <= 0x10FFFF {
+                    self.characterReferenceCode = self.characterReferenceCode * 16 + Int(c.asciiValue! - 0x37)
+                }
+                return continueInCurrentState()
+            case let c? where HTMLTokenizer.asciiLowerHexDigit.contains(c.unicodeScalars.first!):
+                // Same saturation as above; 'a'...'f' map to 10...15.
+                if self.characterReferenceCode <= 0x10FFFF {
+                    self.characterReferenceCode = self.characterReferenceCode * 16 + Int(c.asciiValue! - 0x57)
+                }
+                return continueInCurrentState()
+            case ";":
+                return switchTo(.NumericCharacterReferenceEnd)
+            default:
+                // FIXME: log_parse_error()
+                return reconsume(currentInputCharacter, in: .NumericCharacterReferenceEnd)
+            }
+
+        // 13.2.5.79 Decimal character reference state, https://html.spec.whatwg.org/multipage/parsing.html#decimal-character-reference-state
+        case .DecimalCharacterReference:
+            switch currentInputCharacter {
+            case let c? where HTMLTokenizer.asciiDigit.contains(c.unicodeScalars.first!):
+                // Saturate past U+10FFFF so long digit runs cannot overflow `Int`.
+                if self.characterReferenceCode <= 0x10FFFF {
+                    self.characterReferenceCode = self.characterReferenceCode * 10 + Int(c.asciiValue! - 0x30)
+                }
+                return continueInCurrentState()
+            case ";":
+                return switchTo(.NumericCharacterReferenceEnd)
+            default:
+                // FIXME: log_parse_error()
+                return reconsume(currentInputCharacter, in: .NumericCharacterReferenceEnd)
+            }
+
+        // 13.2.5.80 Numeric character reference end state, https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
+        case .NumericCharacterReferenceEnd:
+            dontConsumeNextInputCharacter()
+            let codePoint: UnicodeScalar =
+                switch self.characterReferenceCode {
+                case 0x00:
+                    // FIXME: log_parse_error()
+                    UnicodeScalar(0xFFFD)!
+                case let c where c > 0x10FFFF:
+                    // FIXME: log_parse_error()
+                    UnicodeScalar(0xFFFD)!
+                // Surrogates only (U+D800...U+DFFF); other BMP values must fall through to the
+                // branches below instead of being replaced.
+                case 0xD800...0xDFFF:
+                    // FIXME: log_parse_error()
+                    UnicodeScalar(0xFFFD)!
+ case let c where UnicodeScalar(c)! == "\u{000D}" || HTMLTokenizer.controlNotAsciiWhitespace.contains(UnicodeScalar(c)!): + // FIXME: log_parse_error() + if let codePoint = HTMLTokenizer.characterReferenceControlCodeMapping[c] { + codePoint + } else { + UnicodeScalar(c)! + } + case let c where HTMLTokenizer.nonCharacter.contains(UnicodeScalar(c)!): + // FIXME: log_parse_error() + UnicodeScalar(c)! + default: + UnicodeScalar(self.characterReferenceCode)! + } + + self.temporaryBuffer = Swift.String(Character(codePoint)) + self.flushCodepointsConsumedAsACharacterReference() + return switchTo(self.returnState) } } } diff --git a/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.cpp b/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.cpp index 888e54f28f2..6c46faba4d6 100644 --- a/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.cpp +++ b/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.cpp @@ -20,4 +20,12 @@ OptionalString decode_to_utf8(StringView text, StringView encoding) return decoded_or_error.release_value(); } +OptionalEntityMatch match_entity_for_named_character_reference(StringView entity) +{ + auto entity_match = code_points_from_entity(entity); + if (entity_match.has_value()) + return entity_match.release_value(); + return std::nullopt; +} + } diff --git a/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.h b/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.h index 3902cb29ea7..35a2de3250b 100644 --- a/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.h +++ b/Libraries/LibWeb/HTML/Parser/HTMLTokenizerHelpers.h @@ -8,6 +8,7 @@ #include #include +#include #include namespace Web::HTML { @@ -16,4 +17,8 @@ namespace Web::HTML { using OptionalString = std::optional; OptionalString decode_to_utf8(StringView text, StringView encoding); +// Swift-friendly wrapper for HTML::code_points_from_entity +using OptionalEntityMatch = std::optional; +OptionalEntityMatch match_entity_for_named_character_reference(StringView entity); + } diff --git 
a/Tests/LibWeb/TestHTMLTokenizerSwift.swift b/Tests/LibWeb/TestHTMLTokenizerSwift.swift index e773023efcf..0d0ce0434bc 100644 --- a/Tests/LibWeb/TestHTMLTokenizerSwift.swift +++ b/Tests/LibWeb/TestHTMLTokenizerSwift.swift @@ -69,11 +69,13 @@ struct TestHTMLTokenizerSwift { #expect(tokenizer.state == HTMLTokenizer.State.Data) // initial state let token = tokenizer.nextToken() - #expect(token?.type == .EndOfFile) - #expect(tokenizer.state == HTMLTokenizer.State.CharacterReference) + #expect(token?.type == .Character(codePoint: "&")) let token2 = tokenizer.nextToken() - #expect(token2 == nil) + #expect(token2?.type == .EndOfFile) + + let token3 = tokenizer.nextToken() + #expect(token3 == nil) } @Test func tagOpenOnly() {