nwex.de/html/tokenizer.ts

640 lines
28 KiB
TypeScript
Raw Normal View History

import { ParseError } from "./errors.js";
import { entities } from "./tokenizer/entities.js";
import { State } from "./tokenizer/state.js";
import { AttributeList, Token, Type } from "./tokenizer/token.js";
// FIXME: Replace console.assert calls with throwing errors
/**
 * Partial implementation of the WHATWG HTML tokenizer state machine.
 *
 * The tokenizer walks `input` one code unit at a time. Every call to
 * {@link Tokenizer.spin} runs the handler of the current state once (plus any
 * handlers entered through "reconsume" transitions) and appends the tokens it
 * produces to {@link Tokenizer.tokens}.
 *
 * Only the states that have a `case` in `spin` are implemented; entering any
 * other state makes the next `spin` call throw.
 */
export class Tokenizer {
  /** Matches exactly one ASCII upper alpha (A-Z). */
  private static readonly ASCII_UPPER_ALPHA = /^[\u0041-\u005A]$/;
  /** Matches exactly one ASCII lower alpha (a-z). */
  private static readonly ASCII_LOWER_ALPHA = /^[\u0061-\u007A]$/;
  /** Matches exactly one ASCII digit (0-9). */
  private static readonly ASCII_DIGIT = /^[\u0030-\u0039]$/;

  /** State whose handler runs on the next call to `spin`. */
  private state: State = State.Data;
  /** State to go back to once a character reference has been handled. */
  private returnState!: State;
  /** Scratch buffer holding the code points of the character reference being read. */
  private temporaryBuffer!: string;
  /** Token under construction (tag, comment or DOCTYPE). */
  private currentToken!: Token;
  /** Character returned by the latest `consumeNext`; `undefined` at end of input. */
  private currentInputCharacter!: string;
  /** Tokens emitted so far, in emission order. */
  public tokens: Array<Token> = [];
  /** Index into `input` of the next character to consume. */
  private pointer: number = 0;

  /**
   * @param input The markup to tokenize.
   */
  public constructor(private input: string) {
  }

  /**
   * Runs the handler of the current state once.
   *
   * A handler may "reconsume" the current character in another state, in
   * which case that state's handler runs recursively before this call returns.
   *
   * @throws Error when the current state has no handler in this class.
   */
  public spin(): void {
    switch (this.state) {
      case State.Data: {
        switch (this.consumeNext()) {
          case '\u0026': // &
            this.returnState = State.Data;
            this.state = State.CharacterReference;
            break;
          case '\u003C': // <
            this.state = State.TagOpen;
            break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.emit({ type: Type.Character, data: this.currentInputCharacter });
            break;
          case undefined:
            this.emit({ type: Type.EndOfFile });
            break;
          default:
            this.emit({ type: Type.Character, data: this.currentInputCharacter });
        }
        break;
      }
      // NOTE(review): this handler ignores '&' and hands '<' to the RAWTEXT less-than state, which is
      // the behavior of the RAWTEXT state rather than RCDATA. Confirm against the State enum whether the
      // label or the transitions are the mistake.
      case State.RCDATA: {
        switch (this.consumeNext()) {
          case '\u003C': // <
            this.state = State.RAWTEXTLessThan;
            break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.emit({ type: Type.Character, data: '\uFFFD' });
            break;
          case undefined:
            this.emit({ type: Type.EndOfFile });
            break;
          default:
            this.emit({ type: Type.Character, data: this.currentInputCharacter });
        }
        break;
      }
      case State.TagOpen: {
        switch (this.consumeNext()) {
          case '\u0021': // !
            this.state = State.MarkupDeclarationOpen;
            break;
          case '\u002F': // /
            this.state = State.EndTagOpen;
            break;
          case '\u003F': // ?
            this.parseError('unexpected-question-mark-instead-of-tag-name');
            this.create({ type: Type.Comment, data: '' });
            this.reconsumeIn(State.BogusComment);
            break;
          case undefined:
            this.parseError('eof-before-tag-name');
            this.emit({ type: Type.Character, data: '\u003C' });
            this.emit({ type: Type.EndOfFile });
            break;
          default: {
            if (this.asciiAlpha(this.currentInputCharacter)) {
              this.create({ type: Type.StartTag, name: '', attributes: new AttributeList() });
              this.reconsumeIn(State.TagName);
            } else {
              // Not a tag after all: the '<' is plain text.
              this.parseError('invalid-first-character-of-tag-name');
              this.emit({ type: Type.Character, data: '\u003C' });
              this.reconsumeIn(State.Data);
            }
          }
        }
        break;
      }
      case State.EndTagOpen: {
        switch (this.consumeNext()) {
          case '\u003E': // >
            this.parseError('missing-end-tag-name');
            this.state = State.Data;
            break;
          case undefined:
            this.parseError('eof-before-tag-name');
            this.emit({ type: Type.Character, data: '\u003C' });
            this.emit({ type: Type.Character, data: '\u002F' });
            this.emit({ type: Type.EndOfFile });
            break;
          default: {
            if (this.asciiAlpha(this.currentInputCharacter)) {
              this.create({ type: Type.EndTag, name: '', attributes: new AttributeList() });
              this.reconsumeIn(State.TagName);
            } else {
              this.parseError('invalid-first-character-of-tag-name');
              this.create({ type: Type.Comment, data: '' });
              this.reconsumeIn(State.BogusComment);
            }
          }
        }
        break;
      }
      case State.MarkupDeclarationOpen: {
        if (this.matchNextFew('--')) {
          this.consumeNextFew('--');
          this.create({ type: Type.Comment, data: '' });
          this.state = State.CommentStart;
        } else if (this.matchNextFewCaseInsensitive('DOCTYPE')) {
          this.consumeNextFewCaseInsensitive('DOCTYPE');
          this.state = State.DOCTYPE;
        } else if (this.matchNextFew('[CDATA[')) {
          this.consumeNextFew('[CDATA[');
          // NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the
          // CDATA section state does not exist and will not be started here.
          this.parseError('cdata-in-html-content');
          this.create({ type: Type.Comment, data: '[CDATA[' });
          this.state = State.BogusComment;
        } else {
          this.parseError('incorrectly-opened-comment');
          this.create({ type: Type.Comment, data: '' });
          this.state = State.BogusComment;
        }
        break;
      }
      case State.DOCTYPE: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020':
            this.state = State.BeforeDOCTYPEName;
            break;
          case '\u003E': // >
            this.reconsumeIn(State.BeforeDOCTYPEName);
            break;
          case undefined:
            this.parseError('eof-in-doctype');
            this.emit({ type: Type.DOCTYPE, forceQuirks: true });
            this.emit({ type: Type.EndOfFile });
            break;
          default:
            this.parseError('missing-whitespace-before-doctype-name');
            this.reconsumeIn(State.BeforeDOCTYPEName);
        }
        break;
      }
      case State.BeforeDOCTYPEName: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020':
            // Skip whitespace in front of the name.
            break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.create({ type: Type.DOCTYPE, name: '\uFFFD' });
            this.state = State.DOCTYPEName;
            break;
          case '\u003E': // > : the DOCTYPE ended before any name was seen.
            this.parseError('missing-doctype-name');
            this.create({ type: Type.DOCTYPE, forceQuirks: true });
            this.state = State.Data;
            this.emitCurrentOfType(Type.DOCTYPE);
            break;
          case undefined:
            this.parseError('eof-in-doctype');
            this.emit({ type: Type.DOCTYPE, forceQuirks: true });
            this.emit({ type: Type.EndOfFile });
            break;
          default: {
            // Only ASCII upper alphas are lower-cased; everything else starts the name verbatim.
            const first = this.asciiUpperAlpha(this.currentInputCharacter)
              ? this.currentInputCharacter.toLowerCase()
              : this.currentInputCharacter;
            this.create({ type: Type.DOCTYPE, name: first });
            this.state = State.DOCTYPEName;
          }
        }
        break;
      }
      case State.DOCTYPEName: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020':
            this.state = State.AfterDOCTYPEName;
            break;
          case '\u003E': // >
            this.state = State.Data;
            this.emitCurrentOfType(Type.DOCTYPE);
            break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfType(Type.DOCTYPE).name += '\uFFFD';
            break;
          case undefined:
            this.parseError('eof-in-doctype');
            this.currentOfType(Type.DOCTYPE).forceQuirks = true;
            this.emitCurrentOfType(Type.DOCTYPE);
            this.emit({ type: Type.EndOfFile });
            break;
          default: {
            if (this.asciiUpperAlpha(this.currentInputCharacter)) {
              this.currentOfType(Type.DOCTYPE).name += this.currentInputCharacter.toLowerCase();
            } else {
              this.currentOfType(Type.DOCTYPE).name += this.currentInputCharacter;
            }
          }
        }
        break;
      }
      case State.TagName: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020':
            this.state = State.BeforeAttributeName;
            break;
          case '\u002F': // /
            this.state = State.SelfClosingStartTag;
            break;
          case '\u003E': // >
            this.state = State.Data;
            this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
            break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfEitherType(Type.StartTag, Type.EndTag).name += '\uFFFD';
            break;
          case undefined:
            this.parseError('eof-in-tag');
            this.emit({ type: Type.EndOfFile });
            break;
          default: {
            if (this.asciiUpperAlpha(this.currentInputCharacter)) {
              this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter.toLowerCase();
            } else {
              this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter;
            }
          }
        }
        break;
      }
      case State.BeforeAttributeName: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020':
            break;
          case '\u002F':
          case '\u003E':
          case undefined:
            this.reconsumeIn(State.AfterAttributeName);
            break;
          case '\u003D': { // = : becomes the first character of the attribute name.
            this.parseError('unexpected-equals-sign-before-attribute-name');
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: this.currentInputCharacter, value: '' });
            this.state = State.AttributeName;
            break;
          }
          default: {
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
            this.reconsumeIn(State.AttributeName);
          }
        }
        break;
      }
      case State.AttributeName: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020':
          case '\u002F':
          case '\u003E':
          case undefined:
            this.reconsumeIn(State.AfterAttributeName);
            break;
          case '\u003D': // =
            this.state = State.BeforeAttributeValue;
            break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += '\uFFFD';
            break;
          case '\u0022':
          case '\u0027':
          case '\u003C':
            // Quotes and '<' are errors inside a name but are still kept as part of it.
            this.parseError('unexpected-character-in-attribute-name');
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
            break;
          default: {
            if (this.asciiUpperAlpha(this.currentInputCharacter)) {
              this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter.toLowerCase();
            } else {
              this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
            }
          }
        }
        break;
      }
      case State.AfterAttributeName: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020':
            break;
          case '\u002F': // /
            this.state = State.SelfClosingStartTag;
            break;
          case '\u003D': // =
            this.state = State.BeforeAttributeValue;
            break;
          case '\u003E': // >
            this.state = State.Data;
            this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
            break;
          case undefined:
            this.parseError('eof-in-tag');
            this.emit({ type: Type.EndOfFile });
            break;
          default:
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
            this.reconsumeIn(State.AttributeName);
        }
        break;
      }
      case State.BeforeAttributeValue: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020':
            break;
          case '\u0022': // "
            this.state = State.AttributeValueDouble;
            break;
          case '\u0027': // '
            this.state = State.AttributeValueSingle;
            break;
          case '\u003E': // >
            this.parseError('missing-attribute-value');
            this.state = State.Data;
            this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
            break;
          default:
            this.reconsumeIn(State.AttributeValueUnquoted);
        }
        break;
      }
      case State.AttributeValueDouble: {
        switch (this.consumeNext()) {
          case '\u0022': // "
            this.state = State.AfterAttributeValue;
            break;
          case '\u0026': // &
            this.returnState = State.AttributeValueDouble;
            this.state = State.CharacterReference;
            break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
            break;
          case undefined:
            this.parseError('eof-in-tag');
            this.emit({ type: Type.EndOfFile });
            break;
          default:
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
        }
        break;
      }
      case State.AttributeValueSingle: {
        switch (this.consumeNext()) {
          case '\u0027': // '
            this.state = State.AfterAttributeValue;
            break;
          case '\u0026': // &
            this.returnState = State.AttributeValueSingle;
            this.state = State.CharacterReference;
            break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
            break;
          case undefined:
            this.parseError('eof-in-tag');
            this.emit({ type: Type.EndOfFile });
            break;
          default:
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
        }
        break;
      }
      case State.AttributeValueUnquoted: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020':
            this.state = State.BeforeAttributeName;
            break;
          case '\u0026': // &
            this.returnState = State.AttributeValueUnquoted;
            this.state = State.CharacterReference;
            break;
          case '\u003E': // >
            this.state = State.Data;
            this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
            break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
            break;
          case '\u0022':
          case '\u0027':
          case '\u003C':
          case '\u003D':
          case '\u0060':
            // Reported as an error, but the character still becomes part of the value.
            this.parseError('unexpected-character-in-unquoted-attribute-value');
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
            break;
          case undefined:
            this.parseError('eof-in-tag');
            this.emit({ type: Type.EndOfFile });
            break;
          default:
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
        }
        break;
      }
      case State.AfterAttributeValue: {
        switch (this.consumeNext()) {
          case '\u0009':
          case '\u000A':
          case '\u000C':
          case '\u0020':
            this.state = State.BeforeAttributeName;
            break;
          case '\u002F': // /
            this.state = State.SelfClosingStartTag;
            break;
          case '\u003E': // >
            this.state = State.Data;
            this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
            break;
          case undefined:
            this.parseError('eof-in-tag');
            this.emit({ type: Type.EndOfFile });
            break;
          default:
            this.parseError('missing-whitespace-between-attributes');
            this.reconsumeIn(State.BeforeAttributeName);
        }
        break;
      }
      case State.CommentStart: {
        switch (this.consumeNext()) {
          case '\u002D': // -
            this.state = State.CommentStartDash;
            break;
          case '\u003E': // >
            this.parseError('abrupt-closing-of-empty-comment');
            this.state = State.Data;
            this.emitCurrentOfType(Type.Comment);
            break;
          default:
            this.reconsumeIn(State.Comment);
        }
        break;
      }
      // FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state
      // (adding **current** in some places)
      case State.Comment: {
        switch (this.consumeNext()) {
          case '\u003C': // <
            this.currentOfType(Type.Comment).data += this.currentInputCharacter;
            this.state = State.CommentLessThanSign;
            break;
          case '\u002D': // -
            this.state = State.CommentEndDash;
            break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfType(Type.Comment).data += '\uFFFD';
            break;
          case undefined:
            this.parseError('eof-in-comment');
            this.emitCurrentOfType(Type.Comment);
            this.emit({ type: Type.EndOfFile });
            break;
          default:
            this.currentOfType(Type.Comment).data += this.currentInputCharacter;
        }
        break;
      }
      case State.CommentEndDash: {
        switch (this.consumeNext()) {
          case '\u002D': // -
            this.state = State.CommentEnd;
            break;
          case undefined:
            this.parseError('eof-in-comment');
            this.emitCurrentOfType(Type.Comment);
            this.emit({ type: Type.EndOfFile });
            break;
          default:
            // The single dash was comment text after all.
            this.currentOfType(Type.Comment).data += '\u002D';
            this.reconsumeIn(State.Comment);
        }
        break;
      }
      // Same FIXME as for the comment state: https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
      case State.CommentEnd: {
        switch (this.consumeNext()) {
          case '\u003E': // >
            this.state = State.Data;
            this.emitCurrentOfType(Type.Comment);
            break;
          case '\u0021': // !
            this.state = State.CommentEndBang;
            break;
          case '\u002D': // -
            this.currentOfType(Type.Comment).data += '\u002D';
            break;
          case undefined:
            this.parseError('eof-in-comment');
            this.emitCurrentOfType(Type.Comment);
            this.emit({ type: Type.EndOfFile });
            break;
          default:
            // The two dashes were comment text after all.
            this.currentOfType(Type.Comment).data += '\u002D\u002D';
            this.reconsumeIn(State.Comment);
        }
        break;
      }
      // Same FIXME as for the comment state: https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
      case State.BogusComment: {
        switch (this.consumeNext()) {
          case '\u003E': // >
            this.state = State.Data;
            this.emitCurrentOfType(Type.Comment);
            break;
          case undefined:
            this.emitCurrentOfType(Type.Comment);
            this.emit({ type: Type.EndOfFile });
            break;
          case '\u0000':
            this.parseError('unexpected-null-character');
            this.currentOfType(Type.Comment).data += '\uFFFD';
            break;
          default:
            this.currentOfType(Type.Comment).data += this.currentInputCharacter;
        }
        break;
      }
      case State.CharacterReference: {
        // The '&' that led here was already consumed by the previous state.
        this.temporaryBuffer = '\u0026';
        switch (this.consumeNext()) {
          case '\u0023': // #
            this.temporaryBuffer += this.currentInputCharacter;
            this.state = State.NumericCharacterReference;
            break;
          default: {
            if (this.asciiAlphanumeric(this.currentInputCharacter)) {
              this.reconsumeIn(State.NamedCharacterReference);
            } else {
              // A lone '&': hand it back as literal text.
              this.flushCodePointsConsumedAsCharacterReference();
              this.reconsumeIn(this.returnState);
            }
          }
        }
        break;
      }
      case State.NamedCharacterReference: {
        // Pick the longest entity name that matches the upcoming input, so that e.g. a name with a
        // trailing semicolon wins over its semicolon-less prefix regardless of table order.
        let longestMatch: string | undefined;
        let replacement = '';
        for (const entry in entities) {
          if ((longestMatch === undefined || entry.length > longestMatch.length) && this.matchNextFew(entry)) {
            longestMatch = entry;
            replacement = entities[entry].characters;
          }
        }
        if (longestMatch === undefined) {
          this.flushCodePointsConsumedAsCharacterReference();
          this.state = State.AmbiguousAmpersand;
          break;
        }
        this.consumeNextFew(longestMatch);
        this.temporaryBuffer += longestMatch;
        const endsWithSemicolon = longestMatch.endsWith('\u003B');
        const following = this.next();
        if (this.consumedAsPartOfAnAttribute() && !endsWithSemicolon && (following === '\u003D' || this.asciiAlphanumeric(following ?? ''))) {
          // Inside an attribute, an unterminated name followed by '=' or an alphanumeric is kept
          // as literal text instead of being resolved.
          this.flushCodePointsConsumedAsCharacterReference();
          this.state = this.returnState;
          break;
        }
        if (!endsWithSemicolon)
          this.parseError('missing-semicolon-after-character-reference');
        this.temporaryBuffer = '';
        this.temporaryBuffer += replacement;
        this.flushCodePointsConsumedAsCharacterReference();
        this.state = this.returnState;
        break;
      }
      case State.AmbiguousAmpersand: {
        switch (this.consumeNext()) {
          case '\u003B': // ;
            this.parseError('unknown-named-character-reference');
            this.reconsumeIn(this.returnState);
            break;
          default: {
            if (!this.asciiAlphanumeric(this.currentInputCharacter)) {
              this.reconsumeIn(this.returnState);
            } else if (this.consumedAsPartOfAnAttribute()) {
              this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
            } else {
              this.emit({ type: Type.Character, data: this.currentInputCharacter });
            }
          }
        }
        break;
      }
      default:
        throw new Error(`FIXME (Tokenizer#spin, Unimplemented state '${this.state}')`);
    }
  }

  /**
   * Outputs the contents of the temporary buffer: appended to the current
   * attribute value when the reference started inside an attribute value,
   * otherwise emitted as one character token per code point.
   */
  private flushCodePointsConsumedAsCharacterReference(): void {
    if (this.consumedAsPartOfAnAttribute()) {
      this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.temporaryBuffer;
      return;
    }
    for (const codePoint of this.temporaryBuffer)
      this.emit({ type: Type.Character, data: codePoint });
  }

  /** Whether the return state is one of the three attribute value states. */
  private consumedAsPartOfAnAttribute(): boolean {
    return this.returnState === State.AttributeValueDouble
      || this.returnState === State.AttributeValueSingle
      || this.returnState === State.AttributeValueUnquoted;
  }

  /** Whether `input` is a single ASCII letter or digit. */
  private asciiAlphanumeric(input: string): boolean {
    return this.asciiAlpha(input) || this.asciiDigit(input);
  }

  /** Whether `input` is a single ASCII letter (A-Z or a-z). */
  private asciiAlpha(input: string): boolean {
    return this.asciiUpperAlpha(input) || this.asciiLowerAlpha(input);
  }

  /** Whether `input` is a single character in the range A-Z. */
  private asciiUpperAlpha(input: string): boolean {
    // RegExp#test stringifies its argument, so an end-of-input `undefined` simply yields false.
    return Tokenizer.ASCII_UPPER_ALPHA.test(input);
  }

  /** Whether `input` is a single character in the range a-z. */
  private asciiLowerAlpha(input: string): boolean {
    return Tokenizer.ASCII_LOWER_ALPHA.test(input);
  }

  /** Whether `input` is a single character in the range 0-9. */
  private asciiDigit(input: string): boolean {
    return Tokenizer.ASCII_DIGIT.test(input);
  }

  /**
   * Steps back one character, switches to `state` and immediately runs it, so
   * the character that was just consumed is handled again by the new state.
   */
  private reconsumeIn(state: State): void {
    this.pointer--;
    this.state = state;
    this.spin();
  }

  /** Reports a parse error on the console; tokenization continues afterwards. */
  private parseError(error: ParseError): void {
    console.error('Parse error: ' + error);
  }

  /**
   * Consumes the next input character, records it as the current input
   * character and returns it.
   *
   * @returns The consumed character, or `undefined` once the input is exhausted.
   */
  private consumeNext(): string | undefined {
    this.currentInputCharacter = this.input[this.pointer];
    this.pointer++;
    return this.currentInputCharacter;
  }

  /** Returns the next input character without consuming it (`undefined` at end of input). */
  private next(): string | undefined {
    return this.input[this.pointer];
  }

  /** Whether the unconsumed input starts with `input` (case-sensitive). */
  private matchNextFew(input: string): boolean {
    return this.input.startsWith(input, this.pointer);
  }

  /** Whether the unconsumed input starts with `input`, ignoring case. */
  private matchNextFewCaseInsensitive(input: string): boolean {
    const upcoming = this.input.slice(this.pointer, this.pointer + input.length);
    return upcoming.toLowerCase() === input.toLowerCase();
  }

  /** Consumes `input.length` characters, asserting that each equals the matching character of `input`. */
  private consumeNextFew(input: string): void {
    for (let i = 0; i < input.length; i++) {
      const consumed = this.consumeNext();
      console.assert(consumed === input[i], {
        message: `Tokenizer#consumeNextFew: Expected '${input[i]}' (${input} at ${i}), got ${consumed} instead`
      });
    }
  }

  /** Case-insensitive variant of `consumeNextFew`. */
  private consumeNextFewCaseInsensitive(input: string): void {
    for (let i = 0; i < input.length; i++) {
      const consumed = this.consumeNext()?.toLowerCase();
      console.assert(consumed === input[i].toLowerCase(), {
        message: `Tokenizer#consumeNextFewCaseInsensitive: Expected '${input[i].toLowerCase()}' (${input.toLowerCase()} at ${i}), got ${consumed} instead`
      });
    }
  }

  /** Appends `token` to the output list. */
  private emit(token: Token): void {
    this.tokens.push(token);
  }

  /** Emits the current token, asserting that it has the given type. */
  private emitCurrentOfType(type: Type): void {
    console.assert(this.currentToken.type === type, {
      message: `Tokenizer#emitCurrentOfType: Expected '${type}', got '${this.currentToken.type}' instead`
    });
    this.tokens.push(this.currentToken);
  }

  /** Emits the current token, asserting that it has one of the two given types. */
  private emitCurrentOfEitherType(a: Type, b: Type): void {
    console.assert(this.currentToken.type === a || this.currentToken.type === b, {
      message: `Tokenizer#emitCurrentOfEitherType: Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`
    });
    this.tokens.push(this.currentToken);
  }

  /**
   * Returns the current token narrowed to the given type. A mismatch is only
   * reported through `console.assert`; the token is returned regardless.
   */
  private currentOfType<T extends Type>(type: T): Token & { type: T } {
    console.assert(this.currentToken.type === type, {
      message: `Tokenizer#currentOfType: Expected '${type}', got '${this.currentToken.type}' instead`
    });
    return this.currentToken as Token & { type: T };
  }

  /**
   * Returns the current token narrowed to either of the given types. A
   * mismatch is only reported through `console.assert`; the token is returned
   * regardless.
   */
  private currentOfEitherType<T extends Type, U extends Type>(a: T, b: U): Token & { type: T | U } {
    console.assert(this.currentToken.type === a || this.currentToken.type === b, {
      message: `Tokenizer#currentOfEitherType: Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`
    });
    return this.currentToken as Token & { type: T | U };
  }

  /** Makes `token` the current token and returns it. */
  private create(token: Token): Token {
    return this.currentToken = token;
  }
}