import { TODO, VERIFY } from "../util/assertions.js";
import { ParseError } from "./errors.js";
import { entities } from "./tokenizer/entities.js";
import { State } from "./tokenizer/state.js";
import { AttributeList, Token, Type } from "./tokenizer/token.js";

/**
 * State-machine tokenizer modelled on the tokenization section of the WHATWG HTML parsing specification.
 *
 * Every call to {@link Tokenizer.spin} runs the handler of the current state once (plus the handler of any
 * state that handler reconsumes into). Finished tokens are appended to {@link Tokenizer.tokens}.
 * States without a handler end up in the `TODO(...)` call of the default branch.
 */
export class Tokenizer {

    /** State whose handler the next `spin()` call runs. */
    private state: State = State.Data;
    /** State to go back to once a character reference has been handled. */
    private returnState!: State;
    /** Scratch buffer holding the code points consumed as part of a character reference. */
    private temporaryBuffer!: string;
    /** Token under construction (tag, comment or DOCTYPE); set through `create()`. */
    private currentToken!: Token;
    /** Character most recently returned by `consumeNext()`; `undefined` at runtime once the input is exhausted. */
    private currentInputCharacter!: string;
    /** Tokens emitted so far, in emission order. */
    public tokens: Array<Token> = new Array<Token>();
    /** Index into `input` of the next character to consume. */
    private pointer: number = 0;

    /**
     * @param input The complete markup to tokenize.
     */
    public constructor(private input: string) { }

    /**
     * Runs the handler for the current state once.
     *
     * A handler typically consumes one input character and then emits tokens, mutates the current token
     * and/or selects the next state. Handlers that reconsume call `spin()` again through `reconsumeIn()`.
     */
    public spin(): void {
        switch (this.state) {
            case State.Data: {
                switch (this.consumeNext()) {
                    case '\u0026': // &
                        this.returnState = State.Data;
                        this.state = State.CharacterReference;
                        break;
                    case '\u003C': // <
                        this.state = State.TagOpen;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.emit({ type: Type.Character, data: this.currentInputCharacter });
                        break;
                    case undefined: // end of input
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.emit({ type: Type.Character, data: this.currentInputCharacter });
                }
                break;
            }
            case State.RCDATA: {
                switch (this.consumeNext()) {
                    case '\u0026': // &
                        // Character references are recognised inside RCDATA, exactly as in the data state.
                        this.returnState = State.RCDATA;
                        this.state = State.CharacterReference;
                        break;
                    case '\u003C': // <
                        // NOTE(review): The specification switches to the *RCDATA* less-than sign state here. The State enum is not
                        // visible from this file, so the original target is kept - confirm whether an RCDATA variant exists.
                        this.state = State.RAWTEXTLessThan;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.emit({ type: Type.Character, data: '\uFFFD' });
                        break;
                    case undefined:
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.emit({ type: Type.Character, data: this.currentInputCharacter });
                }
                break;
            }
            case State.TagOpen: {
                switch (this.consumeNext()) {
                    case '\u0021': // !
                        this.state = State.MarkupDeclarationOpen;
                        break;
                    case '\u002F': // /
                        this.state = State.EndTagOpen;
                        break;
                    case '\u003F': // ?
                        this.parseError('unexpected-question-mark-instead-of-tag-name');
                        this.create({ type: Type.Comment, data: '' });
                        this.reconsumeIn(State.BogusComment);
                        break;
                    case undefined:
                        this.parseError('eof-before-tag-name');
                        this.emit({ type: Type.Character, data: '\u003C' });
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiAlpha(this.currentInputCharacter)) {
                            this.create({ type: Type.StartTag, name: '', attributes: new AttributeList() });
                            this.reconsumeIn(State.TagName);
                            break;
                        }
                        // Not a tag after all: the '<' is plain text.
                        this.parseError('invalid-first-character-of-tag-name');
                        this.emit({ type: Type.Character, data: '\u003C' });
                        this.reconsumeIn(State.Data);
                    }
                }
                break;
            }
            case State.EndTagOpen: {
                switch (this.consumeNext()) {
                    case '\u003E': // >
                        this.parseError('missing-end-tag-name');
                        this.state = State.Data;
                        break;
                    case undefined:
                        this.parseError('eof-before-tag-name');
                        this.emit({ type: Type.Character, data: '\u003C' });
                        this.emit({ type: Type.Character, data: '\u002F' });
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiAlpha(this.currentInputCharacter)) {
                            this.create({ type: Type.EndTag, name: '', attributes: new AttributeList() });
                            this.reconsumeIn(State.TagName);
                            break;
                        }
                        this.parseError('invalid-first-character-of-tag-name');
                        this.create({ type: Type.Comment, data: '' });
                        this.reconsumeIn(State.BogusComment);
                    }
                }
                break;
            }
            case State.MarkupDeclarationOpen: {
                // This state looks ahead without consuming; characters are only consumed once a branch is chosen.
                if (this.matchNextFew('--')) {
                    this.consumeNextFew('--');
                    this.create({ type: Type.Comment, data: '' });
                    this.state = State.CommentStart;
                } else if (this.matchNextFewCaseInsensitive('DOCTYPE')) {
                    this.consumeNextFewCaseInsensitive('DOCTYPE');
                    this.state = State.DOCTYPE;
                } else if (this.matchNextFew('[CDATA[')) {
                    this.consumeNextFew('[CDATA[');
                    // NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the CDATA section
                    //       state does not exist and will not be started here.
                    this.parseError('cdata-in-html-content');
                    this.create({ type: Type.Comment, data: '[CDATA[' });
                    this.state = State.BogusComment;
                } else {
                    this.parseError('incorrectly-opened-comment');
                    this.create({ type: Type.Comment, data: '' });
                    this.state = State.BogusComment;
                }
                break;
            }
            case State.DOCTYPE: {
                switch (this.consumeNext()) {
                    case '\u0009': // tab
                    case '\u000A': // line feed
                    case '\u000C': // form feed
                    case '\u0020': // space
                        this.state = State.BeforeDOCTYPEName;
                        break;
                    case '\u003E': // >
                        this.reconsumeIn(State.BeforeDOCTYPEName);
                        break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.emit({ type: Type.DOCTYPE, forceQuirks: true });
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.parseError('missing-whitespace-before-doctype-name');
                        this.reconsumeIn(State.BeforeDOCTYPEName);
                }
                break;
            }
            case State.BeforeDOCTYPEName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        // Leading whitespace is ignored.
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.create({ type: Type.DOCTYPE, name: '\uFFFD' });
                        this.state = State.DOCTYPEName;
                        break;
                    case '\u003E': // >
                        // "<!DOCTYPE>": there is no name at all, so emit a nameless force-quirks DOCTYPE.
                        this.parseError('missing-doctype-name');
                        this.emit({ type: Type.DOCTYPE, forceQuirks: true });
                        this.state = State.Data;
                        break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.emit({ type: Type.DOCTYPE, forceQuirks: true });
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                            this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter.toLowerCase()});
                            this.state = State.DOCTYPEName;
                            break;
                        }
                        this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter });
                        // The first character of the name has been stored; the rest is collected by the DOCTYPE name state.
                        this.state = State.DOCTYPEName;
                    }
                }
                break;
            }
            case State.DOCTYPEName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.AfterDOCTYPEName;
                        break;
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfType(Type.DOCTYPE);
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfType(Type.DOCTYPE)!.name += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.currentOfType(Type.DOCTYPE).forceQuirks = true;
                        this.emitCurrentOfType(Type.DOCTYPE);
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                            this.currentOfType(Type.DOCTYPE)!.name += this.currentInputCharacter.toLowerCase();
                            break;
                        }
                        this.currentOfType(Type.DOCTYPE)!.name += this.currentInputCharacter;
                    }
                }
                break;
            }
            case State.TagName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.BeforeAttributeName;
                        break;
                    case '\u002F': // /
                        this.state = State.SelfClosingStartTag;
                        break;
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).name += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                            this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter.toLowerCase();
                            break;
                        }
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter;
                    }
                }
                break;
            }
            case State.BeforeAttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        break;
                    case '\u002F': // /
                    case '\u003E': // >
                    case undefined:
                        this.reconsumeIn(State.AfterAttributeName);
                        break;
                    case '\u003D': { // =
                        // A leading '=' becomes the first character of the attribute name.
                        this.parseError('unexpected-equals-sign-before-attribute-name');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: this.currentInputCharacter, value: '' });
                        this.state = State.AttributeName;
                        break;
                    }
                    default: {
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
                        this.reconsumeIn(State.AttributeName);
                    }
                }
                break;
            }
            case State.AttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                    case '\u002F': // /
                    case '\u003E': // >
                    case undefined:
                        this.reconsumeIn(State.AfterAttributeName);
                        break;
                    case '\u003D': // =
                        this.state = State.BeforeAttributeValue;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += '\uFFFD';
                        break;
                    case '\u0022': // "
                    case '\u0027': // '
                    case '\u003C': // <
                        this.parseError('unexpected-character-in-attribute-name');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
                        break;
                    default: {
                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter.toLowerCase();
                            break;
                        }
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
                    }
                }
                break;
            }
            case State.AfterAttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        break;
                    case '\u002F': // /
                        this.state = State.SelfClosingStartTag;
                        break;
                    case '\u003D': // =
                        this.state = State.BeforeAttributeValue;
                        break;
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
                        this.reconsumeIn(State.AttributeName);
                        break;
                }
                break;
            }
            case State.BeforeAttributeValue: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        break;
                    case '\u0022': // "
                        this.state = State.AttributeValueDouble;
                        break;
                    case '\u0027': // '
                        this.state = State.AttributeValueSingle;
                        break;
                    case '\u003E': // >
                        this.parseError('missing-attribute-value');
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
                        break;
                    default:
                        this.reconsumeIn(State.AttributeValueUnquoted);
                }
                break;
            }
            case State.AttributeValueDouble: {
                switch (this.consumeNext()) {
                    case '\u0022': // "
                        this.state = State.AfterAttributeValue;
                        break;
                    case '\u0026': // &
                        this.returnState = State.AttributeValueDouble;
                        this.state = State.CharacterReference;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                }
                break;
            }
            case State.AttributeValueSingle: {
                switch (this.consumeNext()) {
                    case '\u0027': // '
                        this.state = State.AfterAttributeValue;
                        break;
                    case '\u0026': // &
                        this.returnState = State.AttributeValueSingle;
                        this.state = State.CharacterReference;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                }
                break;
            }
            case State.AttributeValueUnquoted: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.BeforeAttributeName;
                        break;
                    case '\u0026': // &
                        this.returnState = State.AttributeValueUnquoted;
                        this.state = State.CharacterReference;
                        break;
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
                        break;
                    case '\u0022': // "
                    case '\u0027': // '
                    case '\u003C': // <
                    case '\u003D': // =
                    case '\u0060': // `
                        this.parseError('unexpected-character-in-unquoted-attribute-value');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                }
                break;
            }
            case State.AfterAttributeValue: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.BeforeAttributeName;
                        break;
                    case '\u002F': // /
                        this.state = State.SelfClosingStartTag;
                        break;
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.parseError('missing-whitespace-between-attributes');
                        this.reconsumeIn(State.BeforeAttributeName);
                }
                break;
            }
            case State.CommentStart: {
                switch (this.consumeNext()) {
                    case '\u002D': // -
                        this.state = State.CommentStartDash;
                        break;
                    case '\u003E': // >
                        this.parseError('abrupt-closing-of-empty-comment');
                        this.state = State.Data;
                        this.emitCurrentOfType(Type.Comment);
                        break;
                    default:
                        this.reconsumeIn(State.Comment);
                }
                break;
            }
            // FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state
            //        (adding **current** in some places)
            case State.Comment: {
                switch (this.consumeNext()) {
                    case '\u003C': // <
                        this.currentOfType(Type.Comment).data += this.currentInputCharacter;
                        this.state = State.CommentLessThanSign;
                        break;
                    case '\u002D': // -
                        this.state = State.CommentEndDash;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfType(Type.Comment).data += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-comment');
                        this.emitCurrentOfType(Type.Comment);
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.currentOfType(Type.Comment).data += this.currentInputCharacter;
                }
                break;
            }
            case State.CommentEndDash: {
                switch (this.consumeNext()) {
                    case '\u002D': // -
                        this.state = State.CommentEnd;
                        break;
                    case undefined:
                        this.parseError('eof-in-comment');
                        this.emitCurrentOfType(Type.Comment);
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        // The single '-' did not start a comment end, so it belongs to the comment text.
                        this.currentOfType(Type.Comment).data += '\u002D';
                        this.reconsumeIn(State.Comment);
                }
                break;
            }
            // FIXME: Same possible specification improvement (adding **current**) for
            //        https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
            case State.CommentEnd: {
                switch (this.consumeNext()) {
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfType(Type.Comment);
                        break;
                    case '\u0021': // !
                        this.state = State.CommentEndBang;
                        break;
                    case '\u002D': // -
                        this.currentOfType(Type.Comment).data += '\u002D';
                        break;
                    case undefined:
                        this.parseError('eof-in-comment');
                        this.emitCurrentOfType(Type.Comment);
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        // "--" was not followed by '>', so both dashes belong to the comment text.
                        this.currentOfType(Type.Comment).data += '\u002D\u002D';
                        this.reconsumeIn(State.Comment);
                }
                break;
            }
            // FIXME: Same possible specification improvement (adding **current**) for
            //        https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
            case State.BogusComment: {
                switch (this.consumeNext()) {
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfType(Type.Comment);
                        break;
                    case undefined:
                        this.emitCurrentOfType(Type.Comment);
                        this.emit({ type: Type.EndOfFile });
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfType(Type.Comment).data += '\uFFFD';
                        break;
                    default:
                        this.currentOfType(Type.Comment).data += this.currentInputCharacter;
                }
                break;
            }
            case State.CharacterReference: {
                // The '&' that led here has already been consumed by the previous state.
                this.temporaryBuffer = '';
                this.temporaryBuffer += '\u0026';
                switch (this.consumeNext()) {
                    case '\u0023': // #
                        this.temporaryBuffer += this.currentInputCharacter;
                        this.state = State.NumericCharacterReference;
                        break;
                    default: {
                        if (this.asciiAlphanumeric(this.currentInputCharacter)) {
                            this.reconsumeIn(State.NamedCharacterReference);
                            break;
                        }
                        // Not a reference: the buffered '&' is literal text.
                        this.flushCodePointsConsumedAsCharacterReference();
                        this.reconsumeIn(this.returnState);
                    }
                }
                break;
            }
            case State.NamedCharacterReference: {
                // The longest matching name has to win (e.g. "amp;" over "amp"), independent of the iteration order of the table.
                // NOTE(review): Assumes the keys of `entities` do not include the leading '&', which has already been consumed at
                //               this point - confirm against ./tokenizer/entities.js.
                let longest = '';
                for (const entry in entities) {
                    if (entry.length > longest.length && this.matchNextFew(entry)) longest = entry;
                }

                if (longest === '') {
                    this.flushCodePointsConsumedAsCharacterReference();
                    this.state = State.AmbiguousAmpersand;
                    break;
                }

                this.consumeNextFew(longest);
                this.temporaryBuffer += longest;

                const endsWithSemicolon = longest[longest.length - 1] === '\u003B';

                // For historical reasons a semicolon-less reference inside an attribute value that is followed by '=' or an
                // alphanumeric is left as literal text.
                if (this.consumedAsPartOfAnAttribute() && !endsWithSemicolon &&
                    (this.next() === '\u003D' || this.asciiAlphanumeric(this.next() ?? ''))) {
                    this.flushCodePointsConsumedAsCharacterReference();
                    this.state = this.returnState;
                    break;
                }

                if (!endsWithSemicolon) this.parseError('missing-semicolon-after-character-reference');

                // Replace the buffered source text with the characters the reference stands for.
                this.temporaryBuffer = '';
                this.temporaryBuffer += entities[longest].characters;
                this.flushCodePointsConsumedAsCharacterReference();
                this.state = this.returnState;
                break;
            }
            case State.AmbiguousAmpersand: {
                switch (this.consumeNext()) {
                    case '\u003B': // ;
                        this.parseError('unknown-named-character-reference');
                        this.reconsumeIn(this.returnState);
                        break;
                    default: {
                        if (this.asciiAlphanumeric(this.currentInputCharacter)) {
                            if (this.consumedAsPartOfAnAttribute()) {
                                this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                            } else {
                                this.emit({ type: Type.Character, data: this.currentInputCharacter });
                            }
                            break;
                        }
                        this.reconsumeIn(this.returnState);
                    }
                }
                break;
            }
            default:
                TODO(`Tokenizer#iterate, Unimplemented state '${this.state}'`);
        }
    }

    /**
     * Outputs the temporary buffer: appended to the current attribute's value when the character reference
     * started inside an attribute value, otherwise emitted as one character token per code point.
     */
    private flushCodePointsConsumedAsCharacterReference(): void {
        if (this.consumedAsPartOfAnAttribute()) {
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.temporaryBuffer;
            return;
        }

        for (const codePoint of this.temporaryBuffer) this.emit({ type: Type.Character, data: codePoint });
    }

    /** @returns Whether the return state is one of the three attribute value states. */
    private consumedAsPartOfAnAttribute(): boolean {
        return this.returnState === State.AttributeValueDouble
            || this.returnState === State.AttributeValueSingle
            || this.returnState === State.AttributeValueUnquoted;
    }

    /** @returns Whether `input` is exactly one ASCII letter or digit. */
    private asciiAlphanumeric(input: string): boolean {
        return this.asciiAlpha(input) || this.asciiDigit(input);
    }

    /** @returns Whether `input` is exactly one ASCII letter. */
    private asciiAlpha(input: string): boolean {
        return this.asciiUpperAlpha(input) || this.asciiLowerAlpha(input);
    }

    /** @returns Whether `input` is exactly one character in the range A-Z. */
    private asciiUpperAlpha(input: string): boolean {
        // Anchored on both sides so that the end-of-input value (coerced to the text "undefined") never matches.
        return /^[\u0041-\u005A]$/.test(input);
    }

    /** @returns Whether `input` is exactly one character in the range a-z. */
    private asciiLowerAlpha(input: string): boolean {
        return /^[\u0061-\u007A]$/.test(input);
    }

    /** @returns Whether `input` is exactly one character in the range 0-9. */
    private asciiDigit(input: string): boolean {
        return /^[\u0030-\u0039]$/.test(input);
    }

    /**
     * Steps the pointer back by one so the current input character is consumed again, switches to `state`
     * and immediately runs that state's handler.
     */
    private reconsumeIn(state: State): void {
        this.pointer--;
        this.state = state;
        this.spin();
    }

    /** Reports a parse error on the console; tokenization continues afterwards. */
    private parseError(error: ParseError): void {
        console.error('Parse error: ' + error);
    }

    /**
     * Consumes the next input character and remembers it as the current input character.
     *
     * @returns The consumed character, or `undefined` when the input is exhausted.
     */
    private consumeNext(): string | undefined {
        this.currentInputCharacter = this.input[this.pointer];
        this.pointer++;
        return this.currentInputCharacter;
    }

    /** @returns The next input character without consuming it, or `undefined` when the input is exhausted. */
    private next(): string | undefined {
        return this.input[this.pointer];
    }

    /** @returns Whether the unconsumed input starts with `input` (case-sensitive). */
    private matchNextFew(input: string): boolean {
        return this.input.startsWith(input, this.pointer);
    }

    /** @returns Whether the unconsumed input starts with `input`, comparing lower-cased text. */
    private matchNextFewCaseInsensitive(input: string): boolean {
        return this.input.slice(this.pointer, this.pointer + input.length).toLowerCase() === input.toLowerCase();
    }

    /**
     * Consumes `input.length` characters, passing each comparison against `input` to VERIFY.
     * NOTE(review): VERIFY presumably aborts on a failed condition - confirm in ../util/assertions.js.
     */
    private consumeNextFew(input: string): void {
        for (let i = 0; i < input.length; i++) {
            const consumed = this.consumeNext();
            VERIFY(consumed === input[i], `Tokenizer#consumeNextFew: Expected '${input[i]}' (${input} at ${i}), got ${consumed} instead`);
        }
    }

    /** Case-insensitive counterpart of `consumeNextFew()`; both sides are lower-cased before comparison. */
    private consumeNextFewCaseInsensitive(input: string): void {
        for (let i = 0; i < input.length; i++) {
            const consumed = this.consumeNext()?.toLowerCase();
            VERIFY(consumed === input[i].toLowerCase(), `Tokenizer#consumeNextFewCaseInsensitive: Expected '${input[i].toLowerCase()}' (${input.toLowerCase()} at ${i}), got ${consumed} instead`);
        }
    }

    /** Appends `token` to the list of emitted tokens. */
    private emit(token: Token): void {
        this.tokens.push(token);
    }

    /** Emits the current token after checking through VERIFY that it has the given type. */
    private emitCurrentOfType(type: Type): void {
        VERIFY(this.currentToken.type === type, `Tokenizer#emitCurrentOfType: Expected '${type}', got '${this.currentToken.type}' instead`);
        this.tokens.push(this.currentToken);
    }

    /** Emits the current token after checking through VERIFY that it has one of the two given types. */
    private emitCurrentOfEitherType(a: Type, b: Type): void {
        VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Tokenizer#emitCurrentOfEitherType: Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);
        this.tokens.push(this.currentToken);
    }

    /** @returns The current token, narrowed to `type` after a VERIFY check. */
    private currentOfType<T extends Type>(type: T): Token & { type: T } {
        VERIFY(this.currentToken.type === type, `Tokenizer#currentOfType: Expected '${type}', got '${this.currentToken.type}' instead`);
        return this.currentToken as Token & { type: T };
    }

    /** @returns The current token, narrowed to `a` or `b` after a VERIFY check. */
    private currentOfEitherType<T extends Type, U extends Type>(a: T, b: U): Token & { type: T | U } {
        VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Tokenizer#currentOfEitherType: Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);
        return this.currentToken as Token & { type: T | U };
    }

    /**
     * Makes `token` the current token.
     *
     * @returns The same token, for convenience.
     */
    private create(token: Token): Token {
        return this.currentToken = token;
    }

}