nwex.de/html/tokenizer.ts
networkException 5afae11193 Everywhere: Replace calls to console.assert and throws with assertions
This patch replaces console.assert calls and random throw new Error
calls to crash on undefined behavior with predefined assertion
functions from utils.
2021-10-24 23:06:35 +02:00

629 lines
27 KiB
TypeScript

import { TODO, VERIFY } from "../util/assertions.js";
import { ParseError } from "./errors.js";
import { entities } from "./tokenizer/entities.js";
import { State } from "./tokenizer/state.js";
import { AttributeList, Token, Type } from "./tokenizer/token.js";
/**
 * Hand-written HTML tokenizer state machine, modelled on the tokenization
 * section of the WHATWG HTML Standard.
 *
 * Each call to {@link Tokenizer.spin} runs the handler of the current state
 * exactly once (handlers that "reconsume" re-enter `spin()` recursively).
 * Finished tokens are appended to {@link Tokenizer.tokens}. States without a
 * handler end up in `TODO(...)`.
 */
export class Tokenizer {
    /** State whose handler the next `spin()` call executes. */
    private state: State = State.Data;
    /** State to go back to once a character reference has been processed. */
    private returnState!: State;
    /** Scratch buffer holding the code points of an in-flight character reference. */
    private temporaryBuffer!: string;
    /** Token under construction (set by `create`, pushed by the `emitCurrent*` helpers). */
    private currentToken!: Token;
    /** Last value returned by `consumeNext()`; `undefined` at runtime once the input is exhausted. */
    private currentInputCharacter!: string;
    /** Every token emitted so far, in emission order. */
    public tokens: Array<Token> = [];
    /** Index into `input` of the next character to consume. */
    private pointer: number = 0;

    /**
     * @param input The complete markup to tokenize.
     */
    public constructor(private input: string) {
    }

    /**
     * Executes one step of the state machine: consumes input according to the
     * current state, possibly emits tokens and selects the next state.
     */
    public spin(): void {
        switch (this.state) {
            case State.Data: {
                switch (this.consumeNext()) {
                    case '\u0026': // &
                        this.returnState = State.Data;
                        this.state = State.CharacterReference;
                        break;
                    case '\u003C': // <
                        this.state = State.TagOpen;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.emit({ type: Type.Character, data: this.currentInputCharacter });
                        break;
                    case undefined:
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.emit({ type: Type.Character, data: this.currentInputCharacter });
                }
                break;
            }
            case State.RCDATA: {
                switch (this.consumeNext()) {
                    case '\u003C': // <
                        // NOTE(review): the specification switches to the *RCDATA* less-than sign state here;
                        // RAWTEXTLessThan looks like a copy/paste slip - confirm against the State enum.
                        this.state = State.RAWTEXTLessThan;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.emit({ type: Type.Character, data: '\uFFFD' });
                        break;
                    case undefined:
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.emit({ type: Type.Character, data: this.currentInputCharacter });
                }
                break;
            }
            case State.TagOpen: {
                switch (this.consumeNext()) {
                    case '\u0021': // !
                        this.state = State.MarkupDeclarationOpen;
                        break;
                    case '\u002F': // /
                        this.state = State.EndTagOpen;
                        break;
                    case '\u003F': // ?
                        this.parseError('unexpected-question-mark-instead-of-tag-name');
                        this.create({ type: Type.Comment, data: '' });
                        this.reconsumeIn(State.BogusComment);
                        break;
                    case undefined:
                        this.parseError('eof-before-tag-name');
                        this.emit({ type: Type.Character, data: '\u003C' });
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiAlpha(this.currentInputCharacter)) {
                            this.create({ type: Type.StartTag, name: '', attributes: new AttributeList() });
                            this.reconsumeIn(State.TagName);
                            break;
                        }
                        this.parseError('invalid-first-character-of-tag-name');
                        this.emit({ type: Type.Character, data: '\u003C' });
                        this.reconsumeIn(State.Data);
                    }
                }
                break;
            }
            case State.EndTagOpen: {
                switch (this.consumeNext()) {
                    case '\u003E': // >
                        this.parseError('missing-end-tag-name');
                        this.state = State.Data;
                        break;
                    case undefined:
                        this.parseError('eof-before-tag-name');
                        this.emit({ type: Type.Character, data: '\u003C' });
                        this.emit({ type: Type.Character, data: '\u002F' });
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiAlpha(this.currentInputCharacter)) {
                            this.create({ type: Type.EndTag, name: '', attributes: new AttributeList() });
                            this.reconsumeIn(State.TagName);
                            break;
                        }
                        this.parseError('invalid-first-character-of-tag-name');
                        this.create({ type: Type.Comment, data: '' });
                        this.reconsumeIn(State.BogusComment);
                    }
                }
                break;
            }
            case State.MarkupDeclarationOpen: {
                // This state only looks ahead; nothing is consumed unless one of the keywords matches.
                if (this.matchNextFew('--')) {
                    this.consumeNextFew('--');
                    this.create({ type: Type.Comment, data: '' });
                    this.state = State.CommentStart;
                } else if (this.matchNextFewCaseInsensitive('DOCTYPE')) {
                    this.consumeNextFewCaseInsensitive('DOCTYPE');
                    this.state = State.DOCTYPE;
                } else if (this.matchNextFew('[CDATA[')) {
                    this.consumeNextFew('[CDATA[');
                    // NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the CDATA section state does not
                    //       exist and will not be started here.
                    this.parseError('cdata-in-html-content');
                    this.create({ type: Type.Comment, data: '[CDATA[' });
                    this.state = State.BogusComment;
                } else {
                    this.parseError('incorrectly-opened-comment');
                    this.create({ type: Type.Comment, data: '' });
                    this.state = State.BogusComment;
                }
                break;
            }
            case State.DOCTYPE: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.BeforeDOCTYPEName;
                        break;
                    case '\u003E': // >
                        this.reconsumeIn(State.BeforeDOCTYPEName);
                        break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.emit({ type: Type.DOCTYPE, forceQuirks: true });
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.parseError('missing-whitespace-before-doctype-name');
                        this.reconsumeIn(State.BeforeDOCTYPEName);
                }
                break;
            }
            // FIXME(review): The specification also handles U+003E (>) in this state (missing-doctype-name parse error,
            //                emit a force-quirks DOCTYPE, switch to the data state). Here '>' falls into the default
            //                branch and becomes the first character of the name - confirm and add the case.
            case State.BeforeDOCTYPEName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        // Leading whitespace is ignored.
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.create({ type: Type.DOCTYPE, name: '\uFFFD' });
                        this.state = State.DOCTYPEName;
                        break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.emit({ type: Type.DOCTYPE, forceQuirks: true });
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                            this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter.toLowerCase() });
                            this.state = State.DOCTYPEName;
                            break;
                        }
                        this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter });
                        // The remaining characters belong to the name of the token that was just created,
                        // so continue in the DOCTYPE name state (not the DOCTYPE state).
                        this.state = State.DOCTYPEName;
                    }
                }
                break;
            }
            case State.DOCTYPEName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.AfterDOCTYPEName;
                        break;
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfType(Type.DOCTYPE);
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfType(Type.DOCTYPE).name += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.currentOfType(Type.DOCTYPE).forceQuirks = true;
                        this.emitCurrentOfType(Type.DOCTYPE);
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                            this.currentOfType(Type.DOCTYPE).name += this.currentInputCharacter.toLowerCase();
                            break;
                        }
                        this.currentOfType(Type.DOCTYPE).name += this.currentInputCharacter;
                    }
                }
                break;
            }
            case State.TagName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.BeforeAttributeName;
                        break;
                    case '\u002F': // /
                        this.state = State.SelfClosingStartTag;
                        break;
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).name += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                            this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter.toLowerCase();
                            break;
                        }
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter;
                    }
                }
                break;
            }
            case State.BeforeAttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        // Whitespace between attributes is ignored.
                        break;
                    case '\u002F':
                    case '\u003E':
                    case undefined:
                        this.reconsumeIn(State.AfterAttributeName);
                        break;
                    case '\u003D': { // =
                        this.parseError('unexpected-equals-sign-before-attribute-name');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: this.currentInputCharacter, value: '' });
                        this.state = State.AttributeName;
                        break;
                    }
                    default: {
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
                        this.reconsumeIn(State.AttributeName);
                    }
                }
                break;
            }
            case State.AttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                    case '\u002F':
                    case '\u003E':
                    case undefined:
                        this.reconsumeIn(State.AfterAttributeName);
                        break;
                    case '\u003D': // =
                        this.state = State.BeforeAttributeValue;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += '\uFFFD';
                        break;
                    case '\u0022':
                    case '\u0027':
                    case '\u003C':
                        // Quotes and '<' are an error inside a name but are still appended verbatim.
                        this.parseError('unexpected-character-in-attribute-name');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
                        break;
                    default: {
                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter.toLowerCase();
                            break;
                        }
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
                    }
                }
                break;
            }
            case State.AfterAttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        break;
                    case '\u002F': // /
                        this.state = State.SelfClosingStartTag;
                        break;
                    case '\u003D': // =
                        this.state = State.BeforeAttributeValue;
                        break;
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
                        this.reconsumeIn(State.AttributeName);
                        break;
                }
                break;
            }
            case State.BeforeAttributeValue: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        break;
                    case '\u0022': // "
                        this.state = State.AttributeValueDouble;
                        break;
                    case '\u0027': // '
                        this.state = State.AttributeValueSingle;
                        break;
                    case '\u003E': // >
                        this.parseError('missing-attribute-value');
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
                        break;
                    default:
                        // Includes end of input, which the unquoted state reports as eof-in-tag.
                        this.reconsumeIn(State.AttributeValueUnquoted);
                }
                break;
            }
            case State.AttributeValueDouble: {
                switch (this.consumeNext()) {
                    case '\u0022': // "
                        this.state = State.AfterAttributeValue;
                        break;
                    case '\u0026': // &
                        this.returnState = State.AttributeValueDouble;
                        this.state = State.CharacterReference;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                }
                break;
            }
            case State.AttributeValueSingle: {
                switch (this.consumeNext()) {
                    case '\u0027': // '
                        this.state = State.AfterAttributeValue;
                        break;
                    case '\u0026': // &
                        this.returnState = State.AttributeValueSingle;
                        this.state = State.CharacterReference;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                }
                break;
            }
            case State.AttributeValueUnquoted: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.BeforeAttributeName;
                        break;
                    case '\u0026': // &
                        this.returnState = State.AttributeValueUnquoted;
                        this.state = State.CharacterReference;
                        break;
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
                        break;
                    case '\u0022':
                    case '\u0027':
                    case '\u003C':
                    case '\u003D':
                    case '\u0060':
                        // Reported as an error, but the character still becomes part of the value.
                        this.parseError('unexpected-character-in-unquoted-attribute-value');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                }
                break;
            }
            case State.AfterAttributeValue: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.BeforeAttributeName;
                        break;
                    case '\u002F': // /
                        this.state = State.SelfClosingStartTag;
                        break;
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.parseError('missing-whitespace-between-attributes');
                        this.reconsumeIn(State.BeforeAttributeName);
                }
                break;
            }
            case State.CommentStart: {
                switch (this.consumeNext()) {
                    case '\u002D': // -
                        this.state = State.CommentStartDash;
                        break;
                    case '\u003E': // >
                        this.parseError('abrupt-closing-of-empty-comment');
                        this.state = State.Data;
                        this.emitCurrentOfType(Type.Comment);
                        break;
                    default:
                        this.reconsumeIn(State.Comment);
                }
                break;
            }
            // FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state (adding **current** in some places)
            case State.Comment: {
                switch (this.consumeNext()) {
                    case '\u003C': // <
                        this.currentOfType(Type.Comment).data += this.currentInputCharacter;
                        this.state = State.CommentLessThanSign;
                        break;
                    case '\u002D': // -
                        this.state = State.CommentEndDash;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfType(Type.Comment).data += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-comment');
                        this.emitCurrentOfType(Type.Comment);
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.currentOfType(Type.Comment).data += this.currentInputCharacter;
                }
                break;
            }
            case State.CommentEndDash: {
                switch (this.consumeNext()) {
                    case '\u002D': // -
                        this.state = State.CommentEnd;
                        break;
                    case undefined:
                        this.parseError('eof-in-comment');
                        this.emitCurrentOfType(Type.Comment);
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        // The single dash seen before was ordinary comment data after all.
                        this.currentOfType(Type.Comment).data += '\u002D';
                        this.reconsumeIn(State.Comment);
                }
                break;
            }
            // FIXME: Same possible improvement as for the comment state: https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
            case State.CommentEnd: {
                switch (this.consumeNext()) {
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfType(Type.Comment);
                        break;
                    case '\u0021': // !
                        this.state = State.CommentEndBang;
                        break;
                    case '\u002D': // -
                        this.currentOfType(Type.Comment).data += '\u002D';
                        break;
                    case undefined:
                        this.parseError('eof-in-comment');
                        this.emitCurrentOfType(Type.Comment);
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        // The two dashes seen before were ordinary comment data after all.
                        this.currentOfType(Type.Comment).data += '\u002D\u002D';
                        this.reconsumeIn(State.Comment);
                }
                break;
            }
            // FIXME: Same possible improvement as for the comment state: https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
            case State.BogusComment: {
                switch (this.consumeNext()) {
                    case '\u003E': // >
                        this.state = State.Data;
                        this.emitCurrentOfType(Type.Comment);
                        break;
                    case undefined:
                        this.emitCurrentOfType(Type.Comment);
                        this.emit({ type: Type.EndOfFile });
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfType(Type.Comment).data += '\uFFFD';
                        break;
                    default:
                        this.currentOfType(Type.Comment).data += this.currentInputCharacter;
                }
                break;
            }
            case State.CharacterReference: {
                // The buffer always starts with the ampersand that led into this state.
                this.temporaryBuffer = '';
                this.temporaryBuffer += '\u0026';
                switch (this.consumeNext()) {
                    case '\u0023': // #
                        this.temporaryBuffer += this.currentInputCharacter;
                        this.state = State.NumericCharacterReference;
                        break;
                    default: {
                        if (this.asciiAlphanumeric(this.currentInputCharacter)) {
                            this.reconsumeIn(State.NamedCharacterReference);
                            break;
                        }
                        this.flushCodePointsConsumedAsCharacterReference();
                        this.reconsumeIn(this.returnState);
                    }
                }
                break;
            }
            case State.NamedCharacterReference: {
                // NOTE(review): The specification asks for the *longest* matching entity name; this loop stops at the
                //               first key of `entities` that matches, so the result depends on the key order of that
                //               table - confirm against the entities module.
                let match = false;
                for (const entry in entities) {
                    if (this.matchNextFew(entry)) {
                        match = true;
                        this.consumeNextFew(entry);
                        this.temporaryBuffer += entry;
                        // Inside an attribute value, a reference without ';' that is followed by '=' or an
                        // alphanumeric is left untouched and flushed as literal text.
                        if (this.consumedAsPartOfAnAttribute() && entry[entry.length - 1] !== '\u003B' && (this.next() === '\u003D' || this.asciiAlphanumeric(this.next() ?? ''))) {
                            this.flushCodePointsConsumedAsCharacterReference();
                            this.state = this.returnState;
                            break;
                        }
                        if (entry[entry.length - 1] !== '\u003B')
                            this.parseError('missing-semicolon-after-character-reference');
                        // Replace the buffered source text by the characters the entity stands for.
                        this.temporaryBuffer = '';
                        this.temporaryBuffer += entities[entry].characters;
                        this.flushCodePointsConsumedAsCharacterReference();
                        this.state = this.returnState;
                        break;
                    }
                }
                if (!match) {
                    this.flushCodePointsConsumedAsCharacterReference();
                    this.state = State.AmbiguousAmpersand;
                }
                break;
            }
            case State.AmbiguousAmpersand: {
                switch (this.consumeNext()) {
                    case '\u003B': // ;
                        this.parseError('unknown-named-character-reference');
                        this.reconsumeIn(this.returnState);
                        break;
                    default: {
                        if (this.asciiAlphanumeric(this.currentInputCharacter)) {
                            if (this.consumedAsPartOfAnAttribute()) {
                                this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                            } else {
                                this.emit({ type: Type.Character, data: this.currentInputCharacter });
                            }
                            break;
                        }
                        this.reconsumeIn(this.returnState);
                    }
                }
                break;
            }
            default:
                TODO(`Tokenizer#spin, Unimplemented state '${this.state}'`);
        }
    }

    /**
     * Flushes the temporary buffer: appended to the current attribute value
     * when the reference started inside an attribute value, otherwise emitted
     * as one character token per code point.
     */
    private flushCodePointsConsumedAsCharacterReference(): void {
        if (this.consumedAsPartOfAnAttribute()) {
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.temporaryBuffer;
            return;
        }
        for (const codePoint of this.temporaryBuffer)
            this.emit({ type: Type.Character, data: codePoint });
    }

    /** @returns Whether the return state is one of the three attribute value states. */
    private consumedAsPartOfAnAttribute(): boolean {
        return this.returnState === State.AttributeValueDouble || this.returnState === State.AttributeValueSingle || this.returnState === State.AttributeValueUnquoted;
    }

    /** @returns Whether `input` is a single ASCII letter or digit. */
    private asciiAlphanumeric(input: string): boolean {
        return this.asciiAlpha(input) || this.asciiDigit(input);
    }

    /** @returns Whether `input` is a single ASCII letter (A-Z or a-z). */
    private asciiAlpha(input: string): boolean {
        return this.asciiUpperAlpha(input) || this.asciiLowerAlpha(input);
    }

    /**
     * @returns Whether `input` is exactly one character in the range A-Z.
     *
     * The pattern is anchored so that anything that is not a single character
     * (including `undefined` at end of input, which `test` would coerce to the
     * string "undefined") is rejected.
     */
    private asciiUpperAlpha(input: string): boolean {
        return /^[\u0041-\u005A]$/.test(input);
    }

    /** @returns Whether `input` is exactly one character in the range a-z (anchored, see `asciiUpperAlpha`). */
    private asciiLowerAlpha(input: string): boolean {
        return /^[\u0061-\u007A]$/.test(input);
    }

    /** @returns Whether `input` is exactly one character in the range 0-9 (anchored, see `asciiUpperAlpha`). */
    private asciiDigit(input: string): boolean {
        return /^[\u0030-\u0039]$/.test(input);
    }

    /**
     * Un-consumes the current input character, switches to `state` and
     * immediately runs that state's handler.
     */
    private reconsumeIn(state: State): void {
        this.pointer--;
        this.state = state;
        this.spin();
    }

    /** Reports a parse error on the console; tokenization continues afterwards. */
    private parseError(error: ParseError): void {
        console.error('Parse error: ' + error);
    }

    /**
     * Consumes one character and remembers it as the current input character.
     *
     * @returns The consumed character, or `undefined` when the input is exhausted
     *          (the pointer is advanced in that case as well, so `reconsumeIn` stays balanced).
     */
    private consumeNext(): string | undefined {
        this.currentInputCharacter = this.input[this.pointer];
        this.pointer++;
        return this.currentInputCharacter;
    }

    /** @returns The next character without consuming it, or `undefined` at end of input. */
    private next(): string | undefined {
        return this.input[this.pointer];
    }

    /** @returns Whether the unconsumed input starts with `input` (case-sensitive). */
    private matchNextFew(input: string): boolean {
        return this.input.startsWith(input, this.pointer);
    }

    /** @returns Whether the unconsumed input starts with `input`, comparing lower-cased text. */
    private matchNextFewCaseInsensitive(input: string): boolean {
        return this.input.slice(this.pointer, this.pointer + input.length).toLowerCase() === input.toLowerCase();
    }

    /** Consumes `input.length` characters and verifies each one equals the corresponding character of `input`. */
    private consumeNextFew(input: string): void {
        for (let i = 0; i < input.length; i++) {
            const consumed = this.consumeNext();
            VERIFY(consumed === input[i], `Tokenizer#consumeNextFew: Expected '${input[i]}' (${input} at ${i}), got ${consumed} instead`);
        }
    }

    /** Case-insensitive variant of `consumeNextFew`. */
    private consumeNextFewCaseInsensitive(input: string): void {
        for (let i = 0; i < input.length; i++) {
            const consumed = this.consumeNext()?.toLowerCase();
            VERIFY(consumed === input[i].toLowerCase(),
                `Tokenizer#consumeNextFewCaseInsensitive: Expected '${input[i].toLowerCase()}' (${input.toLowerCase()} at ${i}), got ${consumed} instead`);
        }
    }

    /** Appends `token` to the output. */
    private emit(token: Token): void {
        this.tokens.push(token);
    }

    /** Appends the current token to the output after verifying it has the given type. */
    private emitCurrentOfType(type: Type): void {
        VERIFY(this.currentToken.type === type, `Tokenizer#emitCurrentOfType: Expected '${type}', got '${this.currentToken.type}' instead`);
        this.tokens.push(this.currentToken);
    }

    /** Appends the current token to the output after verifying it has one of the two given types. */
    private emitCurrentOfEitherType(a: Type, b: Type): void {
        VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Tokenizer#emitCurrentOfEitherType: Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);
        this.tokens.push(this.currentToken);
    }

    /** @returns The current token, narrowed to `type` after a runtime verification. */
    private currentOfType<T extends Type>(type: T): Token & { type: T } {
        VERIFY(this.currentToken.type === type, `Tokenizer#currentOfType: Expected '${type}', got '${this.currentToken.type}' instead`);
        return this.currentToken as Token & { type: T };
    }

    /** @returns The current token, narrowed to `a` or `b` after a runtime verification. */
    private currentOfEitherType<T extends Type, U extends Type>(a: T, b: U): Token & { type: T | U } {
        VERIFY(this.currentToken.type === a || this.currentToken.type === b,
            `Tokenizer#currentOfEitherType: Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);
        return this.currentToken as Token & { type: T | U };
    }

    /** Makes `token` the current token and returns it. */
    private create(token: Token): Token {
        return this.currentToken = token;
    }
}