// nwex.de/html/tokenizer.ts — 657 lines, 29 KiB, TypeScript

import { TODO, VERIFY, VERIFY_NOT_REACHED } from "../util/assertions.js";
import { Constructor } from "../util/guards.js";
import { ParseError } from "./errors.js";
import { entities } from "./tokenizer/entities.js";
import { State } from "./tokenizer/state.js";
import { Attribute, CharacterToken, CommentToken, DOCTYPEToken, EndOfFileToken, EndTagToken, Position, StartTagToken, Token } from "./tokenizer/token.js";
/**
 * Hand-written HTML tokenizer state machine.
 *
 * Every call to {@link Tokenizer.spin} executes one step of the state stored in `state`:
 * it usually consumes one character of `input`, may append to the token under construction,
 * may push finished tokens onto `tokens`, and may switch to another state. State and parse
 * error names follow the WHATWG HTML tokenization algorithm referenced in the comments below.
 */
export class Tokenizer {
  /** State that the next call to `spin()` executes. */
  private state: State = State.Data;
  /** State to go back to once a character reference has been processed. */
  private returnState!: State;
  /** Scratch buffer holding the code points of the character reference being processed. */
  private temporaryBuffer!: string;
  /** Tag, comment or DOCTYPE token that has been created but not emitted yet. */
  private currentToken!: Token;
  /** Result of the latest `consumeNext()`; at runtime this is `undefined` once `input` is exhausted. */
  private currentInputCharacter!: string;
  /** Line/column/index counters advanced by every `consumeNext()`. */
  private currentPosition: Position = { line: 0, column: 0, index: 0 };
  /** Copy of `currentPosition` taken right before the latest `consumeNext()`, used to undo it when reconsuming. */
  private positionBeforeConsume: Position = { line: 0, column: 0, index: 0 };
  /** Tokens emitted so far, in emission order. */
  public tokens: Array<Token> = new Array<Token>();
  /** Index into `input` of the next character to consume. */
  private pointer: number = 0;

  /**
   * @param input The complete markup to tokenize.
   */
  public constructor(private input: string) {
  }

  /**
   * Runs a single step of the state machine for the current `state`.
   *
   * Emitted tokens are appended to `tokens`. States that have no `case` below end up in
   * `TODO(...)`.
   */
  public spin(): void {
    switch (this.state) {
      case State.Data: {
        switch (this.consumeNext()) {
          case '\u0026': // '&'
            this.returnState = State.Data;
            this.state = State.CharacterReference;
            break;
          case '\u003C': this.state = State.TagOpen; break; // '<'
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
            break;
          case undefined: this.emit(EndOfFileToken.create()); break;
          default: this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
        }
        break;
      }
      case State.RCDATA: {
        switch (this.consumeNext()) {
          case '\u0026': // '&' — character references are honoured in RCDATA
            this.returnState = State.RCDATA;
            this.state = State.CharacterReference;
            break;
          // NOTE(review): the spec switches to the "RCDATA less-than sign" state here; this targets the RAWTEXT
          // variant. Left unchanged because the members of `State` are not visible from this file — please confirm.
          case '\u003C': this.state = State.RAWTEXTLessThan; break;
          case '\u0000': this.parseError('unexpected-null-character'); this.emit(CharacterToken.createReplacementCharacter().at(this.currentPosition)); break;
          case undefined: this.emit(EndOfFileToken.create()); break;
          default: this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
        }
        break;
      }
      case State.TagOpen: {
        switch (this.consumeNext()) {
          case '\u0021': this.state = State.MarkupDeclarationOpen; break; // '!'
          case '\u002F': this.state = State.EndTagOpen; break; // '/'
          case '\u003F': // '?'
            this.parseError('unexpected-question-mark-instead-of-tag-name');
            this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
            this.reconsumeIn(State.BogusComment);
            break;
          case undefined:
            this.parseError('eof-before-tag-name');
            this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
            this.emit(EndOfFileToken.create());
            break;
          default: {
            if (this.asciiAlpha(this.currentInputCharacter)) {
              this.create(StartTagToken.createEmpty().startingAt(this.currentPosition));
              this.reconsumeIn(State.TagName);
              break;
            }
            // Not a tag after all: the '<' becomes text and the character is handled as data.
            this.parseError('invalid-first-character-of-tag-name');
            this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
            this.reconsumeIn(State.Data);
          }
        }
        break;
      }
      case State.EndTagOpen: {
        switch (this.consumeNext()) {
          case '\u003E': this.parseError('missing-end-tag-name'); this.state = State.Data; break; // '>'
          case undefined:
            this.parseError('eof-before-tag-name');
            this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
            this.emit(CharacterToken.createWith('\u002F').at(this.currentPosition));
            this.emit(EndOfFileToken.create());
            break;
          default: {
            if (this.asciiAlpha(this.currentInputCharacter)) {
              this.create(EndTagToken.createEmpty().startingAt(this.currentPosition));
              this.reconsumeIn(State.TagName);
              break;
            }
            this.parseError('invalid-first-character-of-tag-name');
            this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
            this.reconsumeIn(State.BogusComment);
          }
        }
        break;
      }
      case State.MarkupDeclarationOpen: {
        // This state looks ahead instead of consuming a single character.
        if (this.matchNextFew('--')) {
          this.consumeNextFew('--');
          this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
          this.state = State.CommentStart;
        } else if (this.matchNextFewCaseInsensitive('DOCTYPE')) {
          this.consumeNextFewCaseInsensitive('DOCTYPE');
          this.state = State.DOCTYPE;
        } else if (this.matchNextFew('[CDATA[')) {
          this.consumeNextFew('[CDATA[');
          // NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the CDATA section state does not
          //       exist and will not be started here.
          this.parseError('cdata-in-html-content');
          this.create(CommentToken.createWith('[CDATA[').startingAt(this.currentPosition));
          this.state = State.BogusComment;
        } else {
          this.parseError('incorrectly-opened-comment');
          this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
          this.state = State.BogusComment;
        }
        break;
      }
      case State.DOCTYPE: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020': this.state = State.BeforeDOCTYPEName; break;
          case '\u003E': this.reconsumeIn(State.BeforeDOCTYPEName); break;
          case undefined:
            this.parseError('eof-in-doctype');
            this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition));
            this.emit(EndOfFileToken.create());
            break;
          default:
            this.parseError('missing-whitespace-before-doctype-name');
            this.reconsumeIn(State.BeforeDOCTYPEName);
        }
        break;
      }
      case State.BeforeDOCTYPEName: {
        // NOTE(review): the spec has a dedicated '>' branch here ("missing-doctype-name": emit a force-quirks DOCTYPE
        // and return to the data state). It is not implemented; '>' currently becomes the first name character.
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020': break; // leading whitespace is skipped
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.create(DOCTYPEToken.createWithName('\uFFFD').startingAt(this.currentPosition));
            this.state = State.DOCTYPEName;
            break;
          case undefined:
            this.parseError('eof-in-doctype');
            this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition));
            this.emit(EndOfFileToken.create());
            break;
          default: {
            if (this.asciiUpperAlpha(this.currentInputCharacter)) {
              this.create(DOCTYPEToken.createWithName(this.currentInputCharacter.toLowerCase()).startingAt(this.currentPosition));
              this.state = State.DOCTYPEName;
              break;
            }
            this.create(DOCTYPEToken.createWithName(this.currentInputCharacter).startingAt(this.currentPosition));
            // The first name character has been stored; the rest of the name is collected in DOCTYPEName.
            this.state = State.DOCTYPEName;
          }
        }
        break;
      }
      case State.DOCTYPEName: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020': this.state = State.AfterDOCTYPEName; break;
          case '\u003E': this.state = State.Data; this.emitCurrentOfType(DOCTYPEToken); break;
          case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(DOCTYPEToken).appendReplacementCharacterToName(); break;
          case undefined:
            this.parseError('eof-in-doctype');
            this.currentOfType(DOCTYPEToken).forceQuirks = true;
            this.emitCurrentOfType(DOCTYPEToken);
            this.emit(EndOfFileToken.create());
            break;
          default: {
            if (this.asciiUpperAlpha(this.currentInputCharacter)) {
              this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter.toLowerCase());
              break;
            }
            this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter);
          }
        }
        break;
      }
      case State.TagName: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020': this.state = State.BeforeAttributeName; break;
          case '\u002F': this.state = State.SelfClosingStartTag; break;
          case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfEitherType(StartTagToken, EndTagToken).appendReplacementCharacterToName();
            break;
          case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
          default: {
            if (this.asciiUpperAlpha(this.currentInputCharacter)) {
              this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter.toLowerCase());
              break;
            }
            this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter);
          }
        }
        break;
      }
      case State.BeforeAttributeName: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020': break;
          case '\u002F':
          case '\u003E':
          case undefined: this.reconsumeIn(State.AfterAttributeName); break;
          case '\u003D': { // '=' — becomes the first character of the attribute name
            this.parseError('unexpected-equals-sign-before-attribute-name');
            this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyValue(this.currentInputCharacter));
            this.state = State.AttributeName;
            break;
          }
          default: {
            this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue());
            this.reconsumeIn(State.AttributeName);
          }
        }
        break;
      }
      case State.AttributeName: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020':
          case '\u002F':
          case '\u003E':
          case undefined: this.reconsumeIn(State.AfterAttributeName); break;
          case '\u003D': this.state = State.BeforeAttributeValue; break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToName();
            break;
          case '\u0022':
          case '\u0027':
          case '\u003C':
            // Reported, but the character is still appended to the name.
            this.parseError('unexpected-character-in-attribute-name');
            this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter);
            break;
          default: {
            if (this.asciiUpperAlpha(this.currentInputCharacter)) {
              this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter.toLowerCase());
              break;
            }
            this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter);
          }
        }
        break;
      }
      case State.AfterAttributeName: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020': break;
          case '\u002F': this.state = State.SelfClosingStartTag; break;
          case '\u003D': this.state = State.BeforeAttributeValue; break;
          case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
          case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
          default:
            this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue());
            this.reconsumeIn(State.AttributeName);
            break;
        }
        break;
      }
      case State.BeforeAttributeValue: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020': break;
          case '\u0022': this.state = State.AttributeValueDouble; break;
          case '\u0027': this.state = State.AttributeValueSingle; break;
          case '\u003E':
            this.parseError('missing-attribute-value');
            this.state = State.Data;
            this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
            break;
          default:
            this.reconsumeIn(State.AttributeValueUnquoted);
        }
        break;
      }
      case State.AttributeValueDouble: {
        switch (this.consumeNext()) {
          case '\u0022': this.state = State.AfterAttributeValue; break;
          case '\u0026': this.returnState = State.AttributeValueDouble; this.state = State.CharacterReference; break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
            break;
          case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
          default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
        }
        break;
      }
      case State.AttributeValueSingle: {
        switch (this.consumeNext()) {
          case '\u0027': this.state = State.AfterAttributeValue; break;
          case '\u0026': this.returnState = State.AttributeValueSingle; this.state = State.CharacterReference; break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
            break;
          case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
          default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
        }
        break;
      }
      case State.AttributeValueUnquoted: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020': this.state = State.BeforeAttributeName; break;
          case '\u0026': this.returnState = State.AttributeValueUnquoted; this.state = State.CharacterReference; break;
          case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
            break;
          case '\u0022':
          case '\u0027':
          case '\u003C':
          case '\u003D':
          case '\u0060':
            // Reported, but the character is still appended to the value.
            this.parseError('unexpected-character-in-unquoted-attribute-value');
            this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
            break;
          case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
          default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
        }
        break;
      }
      case State.AfterAttributeValue: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020': this.state = State.BeforeAttributeName; break;
          case '\u002F': this.state = State.SelfClosingStartTag; break;
          case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
          case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
          default: this.parseError('missing-whitespace-between-attributes'); this.reconsumeIn(State.BeforeAttributeName);
        }
        break;
      }
      case State.CommentStart: {
        switch (this.consumeNext()) {
          case '\u002D': this.state = State.CommentStartDash; break;
          case '\u003E': this.parseError('abrupt-closing-of-empty-comment'); this.state = State.Data; this.emitCurrentOfType(CommentToken); break;
          default: this.reconsumeIn(State.Comment);
        }
        break;
      }
      // FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state (adding **current** in some places)
      case State.Comment: {
        switch (this.consumeNext()) {
          case '\u003C': this.currentOfType(CommentToken).append(this.currentInputCharacter); this.state = State.CommentLessThanSign; break;
          case '\u002D': this.state = State.CommentEndDash; break;
          case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(CommentToken).appendReplacementCharacter(); break;
          case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
          default: this.currentOfType(CommentToken).append(this.currentInputCharacter);
        }
        break;
      }
      case State.CommentEndDash: {
        switch (this.consumeNext()) {
          case '\u002D': this.state = State.CommentEnd; break;
          case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
          // The single '-' seen before turned out to be comment content.
          default: this.currentOfType(CommentToken).append('\u002D'); this.reconsumeIn(State.Comment);
        }
        break;
      }
      // Same as above fixme https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
      case State.CommentEnd: {
        switch (this.consumeNext()) {
          case '\u003E': this.state = State.Data; this.emitCurrentOfType(CommentToken); break;
          case '\u0021': this.state = State.CommentEndBang; break;
          case '\u002D': this.currentOfType(CommentToken).append('\u002D'); break;
          case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
          // The "--" seen before turned out to be comment content.
          default: this.currentOfType(CommentToken).append('\u002D\u002D'); this.reconsumeIn(State.Comment);
        }
        break;
      }
      // Same as above https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
      case State.BogusComment: {
        switch (this.consumeNext()) {
          case '\u003E': this.state = State.Data; this.emitCurrentOfType(CommentToken); break;
          case undefined: this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
          case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(CommentToken).appendReplacementCharacter(); break;
          default: this.currentOfType(CommentToken).append(this.currentInputCharacter);
        }
        break;
      }
      case State.CharacterReference: {
        // The '&' that led here was consumed by the previous state; remember it in case the reference is bogus.
        this.temporaryBuffer = '';
        this.temporaryBuffer += '\u0026';
        switch (this.consumeNext()) {
          case '\u0023': this.temporaryBuffer += this.currentInputCharacter; this.state = State.NumericCharacterReference; break;
          default: {
            if (this.asciiAlphanumeric(this.currentInputCharacter)) {
              this.reconsumeIn(State.NamedCharacterReference);
              break;
            }
            this.flushCodePointsConsumedAsCharacterReference();
            this.reconsumeIn(this.returnState);
          }
        }
        break;
      }
      case State.NamedCharacterReference: {
        // The spec asks for the *longest* matching name; taking the first hit in iteration order could select a
        // semicolon-less legacy name (e.g. "amp") although the full name ("amp;") is present in the input.
        let longestMatch: string | undefined;
        for (const entry in entities) {
          if (this.matchNextFew(entry) && (longestMatch === undefined || entry.length > longestMatch.length))
            longestMatch = entry;
        }
        if (longestMatch === undefined) {
          this.flushCodePointsConsumedAsCharacterReference();
          this.state = State.AmbiguousAmpersand;
          break;
        }
        this.consumeNextFew(longestMatch);
        this.temporaryBuffer += longestMatch;
        const endsWithSemicolon = longestMatch[longestMatch.length - 1] === '\u003B';
        // Inside an attribute value, a semicolon-less match followed by '=' or an alphanumeric is kept as literal text.
        if (this.consumedAsPartOfAnAttribute() && !endsWithSemicolon && (this.next() === '\u003D' || this.asciiAlphanumeric(this.next() ?? ''))) {
          this.flushCodePointsConsumedAsCharacterReference();
          this.state = this.returnState;
          break;
        }
        if (!endsWithSemicolon)
          this.parseError('missing-semicolon-after-character-reference');
        // Replace the raw reference text with the characters it stands for.
        this.temporaryBuffer = '';
        this.temporaryBuffer += entities[longestMatch].characters;
        this.flushCodePointsConsumedAsCharacterReference();
        this.state = this.returnState;
        break;
      }
      case State.AmbiguousAmpersand: {
        switch (this.consumeNext()) {
          case '\u003B': this.parseError('unknown-named-character-reference'); this.reconsumeIn(this.returnState); break;
          default: {
            if (this.asciiAlphanumeric(this.currentInputCharacter)) {
              if (this.consumedAsPartOfAnAttribute()) {
                this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
              } else {
                this.emit(CharacterToken.createWith(this.currentInputCharacter));
              }
              break;
            }
            this.reconsumeIn(this.returnState);
          }
        }
        break;
      }
      default: TODO(`Unimplemented state '${this.state}'`);
    }
  }

  /**
   * Outputs the contents of `temporaryBuffer`: appended to the current attribute value when the
   * character reference started inside an attribute value, otherwise emitted as one character
   * token per code point.
   */
  private flushCodePointsConsumedAsCharacterReference(): void {
    if (this.consumedAsPartOfAnAttribute()) {
      this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.temporaryBuffer);
      return;
    }
    for (const codePoint of this.temporaryBuffer)
      this.emit(CharacterToken.createWith(codePoint));
  }

  /** @returns `true` when `returnState` is one of the three attribute value states. */
  private consumedAsPartOfAnAttribute(): boolean {
    return this.returnState === State.AttributeValueDouble || this.returnState === State.AttributeValueSingle || this.returnState === State.AttributeValueUnquoted;
  }

  /** @returns `true` when `input` is exactly one ASCII letter or digit. */
  private asciiAlphanumeric(input: string): boolean {
    return this.asciiAlpha(input) || this.asciiDigit(input);
  }

  /** @returns `true` when `input` is exactly one ASCII letter of either case. */
  private asciiAlpha(input: string): boolean {
    return this.asciiUpperAlpha(input) || this.asciiLowerAlpha(input);
  }

  /**
   * @returns `true` when `input` is exactly one character in U+0041 (A) to U+005A (Z).
   *
   * The pattern is anchored on purpose: callers may pass the end-of-input value (`undefined`),
   * which `RegExp.prototype.test` turns into the string "undefined" and which must not match.
   */
  private asciiUpperAlpha(input: string): boolean {
    return /^[\u0041-\u005A]$/.test(input);
  }

  /** @returns `true` when `input` is exactly one character in U+0061 (a) to U+007A (z); anchored like `asciiUpperAlpha`. */
  private asciiLowerAlpha(input: string): boolean {
    return /^[\u0061-\u007A]$/.test(input);
  }

  /** @returns `true` when `input` is exactly one character in U+0030 (0) to U+0039 (9). */
  private asciiDigit(input: string): boolean {
    return /^[\u0030-\u0039]$/.test(input);
  }

  /**
   * Un-consumes the character read by the latest `consumeNext()`, switches to `state` and
   * immediately runs that state through `spin()`.
   *
   * Both the input pointer and the position counters are rewound, so the character is not
   * counted twice when it is consumed again.
   */
  private reconsumeIn(state: State): void {
    this.pointer--;
    // Mutate in place rather than swapping the object, exactly like consumeNext() does.
    this.currentPosition.line = this.positionBeforeConsume.line;
    this.currentPosition.column = this.positionBeforeConsume.column;
    this.currentPosition.index = this.positionBeforeConsume.index;
    this.state = state;
    this.spin();
  }

  /** Reports a parse error by logging it with `console.error`; tokenization continues afterwards. */
  private parseError(error: ParseError): void {
    console.error('Parse error: ' + error);
  }

  /**
   * Reads the character at `pointer`, stores it in `currentInputCharacter` and advances the
   * pointer and the position counters (a line feed resets the column and starts a new line).
   *
   * @returns The consumed character, or `undefined` when the pointer is past the end of `input`.
   */
  private consumeNext(): string | undefined {
    // Remember where we were so that reconsumeIn() can undo this step.
    this.positionBeforeConsume = {
      line: this.currentPosition.line,
      column: this.currentPosition.column,
      index: this.currentPosition.index,
    };
    this.currentInputCharacter = this.input[this.pointer];
    this.pointer++;
    this.currentPosition.column++;
    this.currentPosition.index++;
    if (this.currentInputCharacter === '\n') {
      this.currentPosition.column = 0;
      this.currentPosition.line++;
    }
    return this.currentInputCharacter;
  }

  /** @returns The next character without consuming it, or `undefined` at the end of `input`. */
  private next(): string | undefined {
    return this.input[this.pointer];
  }

  /** @returns `true` when the not yet consumed input starts with `input` (case-sensitive). */
  private matchNextFew(input: string): boolean {
    return this.input.startsWith(input, this.pointer);
  }

  /** @returns `true` when the not yet consumed input starts with `input`, comparing lower-cased text. */
  private matchNextFewCaseInsensitive(input: string): boolean {
    const upcoming = this.input.substring(this.pointer, this.pointer + input.length);
    return upcoming.toLowerCase() === input.toLowerCase();
  }

  /** Consumes `input.length` characters and verifies that each equals the corresponding character of `input`. */
  private consumeNextFew(input: string): void {
    for (let i = 0; i < input.length; i++) {
      const consumed = this.consumeNext();
      VERIFY(consumed === input[i], `Expected '${input[i]}' (${input} at ${i}), got ${consumed} instead`);
    }
  }

  /** Same as `consumeNextFew`, but compares lower-cased characters. */
  private consumeNextFewCaseInsensitive(input: string): void {
    for (let i = 0; i < input.length; i++) {
      const consumed = this.consumeNext()?.toLowerCase();
      VERIFY(consumed === input[i].toLowerCase(), `Expected '${input[i].toLowerCase()}' (${input.toLowerCase()} at ${i}), got ${consumed} instead`);
    }
  }

  /** Completes the range of `token` if necessary and appends it to `tokens`. */
  private emit(token: Token): void {
    this.populateRangeOnEmit(token);
    this.tokens.push(token);
  }

  /** Emits `currentToken` after verifying that it is an instance of `type`. */
  private emitCurrentOfType(type: Constructor<Token>): void {
    VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`);
    this.populateRangeOnEmit(this.currentToken);
    this.tokens.push(this.currentToken);
  }

  /** Emits `currentToken` after verifying that it is an instance of `a` or of `b`. */
  private emitCurrentOfEitherType<T extends Token, U extends Token>(a: Constructor<T>, b: Constructor<U>): void {
    VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`);
    this.populateRangeOnEmit(this.currentToken);
    this.tokens.push(this.currentToken);
  }

  /**
   * @returns `currentToken`, after verifying that it is an instance of `type`.
   *
   * This is a pure accessor: the token's range is only completed when the token is emitted,
   * otherwise the range would end at the position of the first access.
   */
  private currentOfType<T extends Token>(type: Constructor<T>): T {
    VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`);
    return this.currentToken;
  }

  /**
   * @returns `currentToken`, after verifying that it is an instance of `a` or of `b`.
   *
   * Like `currentOfType`, this does not touch the token's range.
   */
  private currentOfEitherType<T extends Token, U extends Token>(a: Constructor<T>, b: Constructor<U>): T | U {
    VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`);
    return this.currentToken;
  }

  /**
   * Fills in whatever part of `token.range` is still missing, using the current position:
   * no start and no end -> `at`, start only -> `endingAt`. An end without a start is a bug.
   */
  private populateRangeOnEmit(token: Token): void {
    if (token.range.start === undefined && token.range.end === undefined)
      token.at(this.currentPosition);
    if (token.range.start !== undefined && token.range.end === undefined)
      token.endingAt(this.currentPosition);
    if (token.range.start === undefined && token.range.end !== undefined)
      VERIFY_NOT_REACHED();
  }

  /**
   * Makes `token` the token under construction, starting its range at the current position
   * unless a start is already set.
   *
   * @returns The token that was passed in.
   */
  private create(token: Token): Token {
    if (token.range.start === undefined)
      token.startingAt(this.currentPosition);
    return this.currentToken = token;
  }
}