nwex.de/html/tokenizer.ts
networkException 586546ee57 Everywhere: Rework website concept completely
This patch removes the Next.js React project that was previously
contained in this repository. The replacement is a vanilla HTML
page with TypeScript that parses its own HTML source and highlights
it on load.

The concept will be iterated on in following commits. Planned are
on-hover tooltips showing metadata about HTML tokens as well as
tokenizing (perhaps parsing) of JavaScript and CSS to be able to
highlight those sections as well. To properly determine the range
of script and style sections it might be required to also implement
HTML tree building; however, executing JavaScript on read,
on-the-fly parsing, and fragment parsing are not required for
the site.

This commit merely represents a start and is made to better track
the progress of changes.
2021-10-24 22:36:38 +02:00

639 lines
28 KiB
TypeScript

import { ParseError } from "./errors.js";
import { entities } from "./tokenizer/entities.js";
import { State } from "./tokenizer/state.js";
import { AttributeList, Token, Type } from "./tokenizer/token.js";
// FIXME: Replace console.assert calls with throwing errors
/**
 * Incremental HTML tokenizer structured after the state machine of the WHATWG
 * HTML tokenization algorithm.
 *
 * Each call to {@link Tokenizer.spin} handles the current state once (plus any
 * states entered through "reconsume") and appends emitted tokens to
 * {@link Tokenizer.tokens}. The class contains no driving loop of its own:
 * callers are expected to call `spin()` repeatedly, e.g. until an
 * `Type.EndOfFile` token has been emitted.
 *
 * Only a subset of the specification's states is implemented; entering any
 * other state makes `spin()` throw.
 */
export class Tokenizer {
    /** State handled by the next call to `spin()`. */
    private state: State = State.Data;
    /** State to go back to once a character reference has been handled. */
    private returnState!: State;
    /** Scratch buffer holding the code points of a character reference. */
    private temporaryBuffer!: string;
    /** Token under construction (tag, comment or DOCTYPE). */
    private currentToken!: Token;
    /** Last value returned by `consumeNext()`; `undefined` at end of input. */
    private currentInputCharacter!: string;
    /** All tokens emitted so far, in emission order. */
    public tokens: Array<Token> = [];
    /** Index into `input` of the next character to consume. */
    private pointer = 0;

    /**
     * @param input Complete HTML source text to tokenize.
     */
    public constructor(private readonly input: string) {
    }

    /**
     * Runs one step of the state machine for the current state.
     *
     * @throws Error when the current state is not implemented.
     */
    public spin(): void {
        switch (this.state) {
            case State.Data: {
                switch (this.consumeNext()) {
                    case '\u0026':
                        this.returnState = State.Data;
                        this.state = State.CharacterReference;
                        break;
                    case '\u003C': this.state = State.TagOpen; break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.emitCharacter(this.currentInputCharacter);
                        break;
                    case undefined: this.emitEndOfFile(); break;
                    default: this.emitCharacter(this.currentInputCharacter);
                }

                break;
            }
            case State.RCDATA: {
                switch (this.consumeNext()) {
                    // Character references are decoded inside RCDATA as well.
                    case '\u0026':
                        this.returnState = State.RCDATA;
                        this.state = State.CharacterReference;
                        break;
                    // NOTE(review): the specification switches to the RCDATA less-than sign state here;
                    // RAWTEXTLessThan looks like a mix-up - confirm against the State enum before changing.
                    case '\u003C': this.state = State.RAWTEXTLessThan; break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.emitCharacter('\uFFFD');
                        break;
                    case undefined: this.emitEndOfFile(); break;
                    default: this.emitCharacter(this.currentInputCharacter);
                }

                break;
            }
            case State.TagOpen: {
                switch (this.consumeNext()) {
                    case '\u0021': this.state = State.MarkupDeclarationOpen; break;
                    case '\u002F': this.state = State.EndTagOpen; break;
                    case '\u003F':
                        this.parseError('unexpected-question-mark-instead-of-tag-name');
                        this.create({ type: Type.Comment, data: '' });
                        this.reconsumeIn(State.BogusComment);
                        break;
                    case undefined:
                        this.parseError('eof-before-tag-name');
                        this.emitCharacter('\u003C');
                        this.emitEndOfFile();
                        break;
                    default: {
                        if (this.asciiAlpha(this.currentInputCharacter)) {
                            this.create({ type: Type.StartTag, name: '', attributes: new AttributeList() });
                            this.reconsumeIn(State.TagName);
                            break;
                        }

                        // Not a tag after all: the '<' is plain text.
                        this.parseError('invalid-first-character-of-tag-name');
                        this.emitCharacter('\u003C');
                        this.reconsumeIn(State.Data);
                    }
                }

                break;
            }
            case State.EndTagOpen: {
                switch (this.consumeNext()) {
                    case '\u003E':
                        this.parseError('missing-end-tag-name');
                        this.state = State.Data;
                        break;
                    case undefined:
                        this.parseError('eof-before-tag-name');
                        this.emitCharacter('\u003C');
                        this.emitCharacter('\u002F');
                        this.emitEndOfFile();
                        break;
                    default: {
                        if (this.asciiAlpha(this.currentInputCharacter)) {
                            this.create({ type: Type.EndTag, name: '', attributes: new AttributeList() });
                            this.reconsumeIn(State.TagName);
                            break;
                        }

                        this.parseError('invalid-first-character-of-tag-name');
                        this.create({ type: Type.Comment, data: '' });
                        this.reconsumeIn(State.BogusComment);
                    }
                }

                break;
            }
            case State.MarkupDeclarationOpen: {
                if (this.matchNextFew('--')) {
                    this.consumeNextFew('--');
                    this.create({ type: Type.Comment, data: '' });
                    this.state = State.CommentStart;
                } else if (this.matchNextFewCaseInsensitive('DOCTYPE')) {
                    this.consumeNextFewCaseInsensitive('DOCTYPE');
                    this.state = State.DOCTYPE;
                } else if (this.matchNextFew('[CDATA[')) {
                    this.consumeNextFew('[CDATA[');
                    // NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the CDATA section state does not
                    //       exist and will not be started here.
                    this.parseError('cdata-in-html-content');
                    this.create({ type: Type.Comment, data: '[CDATA[' });
                    this.state = State.BogusComment;
                } else {
                    this.parseError('incorrectly-opened-comment');
                    this.create({ type: Type.Comment, data: '' });
                    this.state = State.BogusComment;
                }

                break;
            }
            case State.DOCTYPE: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': this.state = State.BeforeDOCTYPEName; break;
                    case '\u003E': this.reconsumeIn(State.BeforeDOCTYPEName); break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.emit({ type: Type.DOCTYPE, forceQuirks: true });
                        this.emitEndOfFile();
                        break;
                    default:
                        this.parseError('missing-whitespace-before-doctype-name');
                        this.reconsumeIn(State.BeforeDOCTYPEName);
                }

                break;
            }
            // FIXME: The specification's '>' case (missing-doctype-name) is not handled here yet,
            //        '>' currently becomes the first character of the DOCTYPE name.
            case State.BeforeDOCTYPEName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.create({ type: Type.DOCTYPE, name: '\uFFFD' });
                        this.state = State.DOCTYPEName;
                        break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.emit({ type: Type.DOCTYPE, forceQuirks: true });
                        this.emitEndOfFile();
                        break;
                    default:
                        // The first name character starts the token; the remaining ones are
                        // collected in the DOCTYPE name state.
                        this.create({ type: Type.DOCTYPE, name: this.toAsciiLower(this.currentInputCharacter) });
                        this.state = State.DOCTYPEName;
                }

                break;
            }
            case State.DOCTYPEName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': this.state = State.AfterDOCTYPEName; break;
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentOfType(Type.DOCTYPE);
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfType(Type.DOCTYPE).name += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.currentOfType(Type.DOCTYPE).forceQuirks = true;
                        this.emitCurrentOfType(Type.DOCTYPE);
                        this.emitEndOfFile();
                        break;
                    default:
                        this.currentOfType(Type.DOCTYPE).name += this.toAsciiLower(this.currentInputCharacter);
                }

                break;
            }
            case State.TagName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': this.state = State.BeforeAttributeName; break;
                    case '\u002F': this.state = State.SelfClosingStartTag; break;
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentTag();
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentTag().name += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emitEndOfFile();
                        break;
                    default:
                        this.currentTag().name += this.toAsciiLower(this.currentInputCharacter);
                }

                break;
            }
            case State.BeforeAttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': break;
                    case '\u002F':
                    case '\u003E':
                    case undefined: this.reconsumeIn(State.AfterAttributeName); break;
                    case '\u003D':
                        // A leading '=' is treated as the first character of the attribute name.
                        this.parseError('unexpected-equals-sign-before-attribute-name');
                        this.currentTag().attributes.append({ name: this.currentInputCharacter, value: '' });
                        this.state = State.AttributeName;
                        break;
                    default:
                        this.currentTag().attributes.append({ name: '', value: '' });
                        this.reconsumeIn(State.AttributeName);
                }

                break;
            }
            case State.AttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                    case '\u002F':
                    case '\u003E':
                    case undefined: this.reconsumeIn(State.AfterAttributeName); break;
                    case '\u003D': this.state = State.BeforeAttributeValue; break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentAttribute().name += '\uFFFD';
                        break;
                    case '\u0022':
                    case '\u0027':
                    case '\u003C':
                        this.parseError('unexpected-character-in-attribute-name');
                        this.currentAttribute().name += this.currentInputCharacter;
                        break;
                    default:
                        this.currentAttribute().name += this.toAsciiLower(this.currentInputCharacter);
                }

                break;
            }
            case State.AfterAttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': break;
                    case '\u002F': this.state = State.SelfClosingStartTag; break;
                    case '\u003D': this.state = State.BeforeAttributeValue; break;
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentTag();
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emitEndOfFile();
                        break;
                    default:
                        this.currentTag().attributes.append({ name: '', value: '' });
                        this.reconsumeIn(State.AttributeName);
                }

                break;
            }
            case State.BeforeAttributeValue: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': break;
                    case '\u0022': this.state = State.AttributeValueDouble; break;
                    case '\u0027': this.state = State.AttributeValueSingle; break;
                    case '\u003E':
                        this.parseError('missing-attribute-value');
                        this.state = State.Data;
                        this.emitCurrentTag();
                        break;
                    default:
                        this.reconsumeIn(State.AttributeValueUnquoted);
                }

                break;
            }
            case State.AttributeValueDouble: {
                switch (this.consumeNext()) {
                    case '\u0022': this.state = State.AfterAttributeValue; break;
                    case '\u0026':
                        this.returnState = State.AttributeValueDouble;
                        this.state = State.CharacterReference;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentAttribute().value += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emitEndOfFile();
                        break;
                    default: this.currentAttribute().value += this.currentInputCharacter;
                }

                break;
            }
            case State.AttributeValueSingle: {
                switch (this.consumeNext()) {
                    case '\u0027': this.state = State.AfterAttributeValue; break;
                    case '\u0026':
                        this.returnState = State.AttributeValueSingle;
                        this.state = State.CharacterReference;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentAttribute().value += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emitEndOfFile();
                        break;
                    default: this.currentAttribute().value += this.currentInputCharacter;
                }

                break;
            }
            case State.AttributeValueUnquoted: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': this.state = State.BeforeAttributeName; break;
                    case '\u0026':
                        this.returnState = State.AttributeValueUnquoted;
                        this.state = State.CharacterReference;
                        break;
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentTag();
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentAttribute().value += '\uFFFD';
                        break;
                    case '\u0022':
                    case '\u0027':
                    case '\u003C':
                    case '\u003D':
                    case '\u0060':
                        this.parseError('unexpected-character-in-unquoted-attribute-value');
                        this.currentAttribute().value += this.currentInputCharacter;
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emitEndOfFile();
                        break;
                    default: this.currentAttribute().value += this.currentInputCharacter;
                }

                break;
            }
            case State.AfterAttributeValue: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': this.state = State.BeforeAttributeName; break;
                    case '\u002F': this.state = State.SelfClosingStartTag; break;
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentTag();
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emitEndOfFile();
                        break;
                    default:
                        this.parseError('missing-whitespace-between-attributes');
                        this.reconsumeIn(State.BeforeAttributeName);
                }

                break;
            }
            case State.CommentStart: {
                switch (this.consumeNext()) {
                    case '\u002D': this.state = State.CommentStartDash; break;
                    case '\u003E':
                        this.parseError('abrupt-closing-of-empty-comment');
                        this.state = State.Data;
                        this.emitCurrentOfType(Type.Comment);
                        break;
                    default: this.reconsumeIn(State.Comment);
                }

                break;
            }
            // FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state (adding **current** in some places)
            case State.Comment: {
                switch (this.consumeNext()) {
                    case '\u003C':
                        this.currentOfType(Type.Comment).data += this.currentInputCharacter;
                        this.state = State.CommentLessThanSign;
                        break;
                    case '\u002D': this.state = State.CommentEndDash; break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfType(Type.Comment).data += '\uFFFD';
                        break;
                    case undefined:
                        this.parseError('eof-in-comment');
                        this.emitCurrentOfType(Type.Comment);
                        this.emitEndOfFile();
                        break;
                    default: this.currentOfType(Type.Comment).data += this.currentInputCharacter;
                }

                break;
            }
            case State.CommentEndDash: {
                switch (this.consumeNext()) {
                    case '\u002D': this.state = State.CommentEnd; break;
                    case undefined:
                        this.parseError('eof-in-comment');
                        this.emitCurrentOfType(Type.Comment);
                        this.emitEndOfFile();
                        break;
                    default:
                        // The single '-' was comment content after all.
                        this.currentOfType(Type.Comment).data += '\u002D';
                        this.reconsumeIn(State.Comment);
                }

                break;
            }
            // Same as above fixme https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
            case State.CommentEnd: {
                switch (this.consumeNext()) {
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentOfType(Type.Comment);
                        break;
                    case '\u0021': this.state = State.CommentEndBang; break;
                    case '\u002D': this.currentOfType(Type.Comment).data += '\u002D'; break;
                    case undefined:
                        this.parseError('eof-in-comment');
                        this.emitCurrentOfType(Type.Comment);
                        this.emitEndOfFile();
                        break;
                    default:
                        // The '--' did not close the comment, so it belongs to the comment data.
                        this.currentOfType(Type.Comment).data += '\u002D\u002D';
                        this.reconsumeIn(State.Comment);
                }

                break;
            }
            // Same as above https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
            case State.BogusComment: {
                switch (this.consumeNext()) {
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentOfType(Type.Comment);
                        break;
                    case undefined:
                        this.emitCurrentOfType(Type.Comment);
                        this.emitEndOfFile();
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfType(Type.Comment).data += '\uFFFD';
                        break;
                    default: this.currentOfType(Type.Comment).data += this.currentInputCharacter;
                }

                break;
            }
            case State.CharacterReference: {
                // The '&' that led here has already been consumed by the previous state.
                this.temporaryBuffer = '\u0026';

                switch (this.consumeNext()) {
                    case '\u0023':
                        this.temporaryBuffer += this.currentInputCharacter;
                        this.state = State.NumericCharacterReference;
                        break;
                    default: {
                        if (this.asciiAlphanumeric(this.currentInputCharacter)) {
                            this.reconsumeIn(State.NamedCharacterReference);
                            break;
                        }

                        this.flushCodePointsConsumedAsCharacterReference();
                        this.reconsumeIn(this.returnState);
                    }
                }

                break;
            }
            case State.NamedCharacterReference: {
                // Pick the longest entity name matching at the current position, independent of
                // the iteration order of the entity table ("&notin;" must not resolve as "&not").
                let longest: string | undefined;

                for (const entry in entities) {
                    if ((longest === undefined || entry.length > longest.length) && this.matchNextFew(entry))
                        longest = entry;
                }

                if (longest === undefined) {
                    this.flushCodePointsConsumedAsCharacterReference();
                    this.state = State.AmbiguousAmpersand;
                    break;
                }

                this.consumeNextFew(longest);
                this.temporaryBuffer += longest;

                const terminated = longest[longest.length - 1] === '\u003B';

                // For historical reasons an unterminated reference inside an attribute value that is
                // followed by '=' or an alphanumeric is kept as literal text.
                if (this.consumedAsPartOfAnAttribute() && !terminated && (this.next() === '\u003D' || this.asciiAlphanumeric(this.next() ?? ''))) {
                    this.flushCodePointsConsumedAsCharacterReference();
                    this.state = this.returnState;
                    break;
                }

                if (!terminated)
                    this.parseError('missing-semicolon-after-character-reference');

                this.temporaryBuffer = entities[longest].characters;
                this.flushCodePointsConsumedAsCharacterReference();
                this.state = this.returnState;

                break;
            }
            case State.AmbiguousAmpersand: {
                switch (this.consumeNext()) {
                    case '\u003B':
                        this.parseError('unknown-named-character-reference');
                        this.reconsumeIn(this.returnState);
                        break;
                    default: {
                        if (this.asciiAlphanumeric(this.currentInputCharacter)) {
                            if (this.consumedAsPartOfAnAttribute()) {
                                this.currentAttribute().value += this.currentInputCharacter;
                            } else {
                                this.emitCharacter(this.currentInputCharacter);
                            }

                            break;
                        }

                        this.reconsumeIn(this.returnState);
                    }
                }

                break;
            }
            default: throw new Error(`FIXME (Tokenizer#spin, Unimplemented state '${this.state}')`);
        }
    }

    /**
     * Writes the temporary buffer to where the character reference was found:
     * appended to the current attribute value, or emitted as one character
     * token per code point.
     */
    private flushCodePointsConsumedAsCharacterReference(): void {
        if (this.consumedAsPartOfAnAttribute()) {
            this.currentAttribute().value += this.temporaryBuffer;
            return;
        }

        for (const codePoint of this.temporaryBuffer)
            this.emitCharacter(codePoint);
    }

    /** Whether the return state is one of the three attribute value states. */
    private consumedAsPartOfAnAttribute(): boolean {
        return this.returnState === State.AttributeValueDouble
            || this.returnState === State.AttributeValueSingle
            || this.returnState === State.AttributeValueUnquoted;
    }

    /** Whether `input` is exactly one ASCII letter or digit. */
    private asciiAlphanumeric(input: string): boolean {
        return this.asciiAlpha(input) || this.asciiDigit(input);
    }

    /** Whether `input` is exactly one ASCII letter. */
    private asciiAlpha(input: string): boolean {
        return this.asciiUpperAlpha(input) || this.asciiLowerAlpha(input);
    }

    // The patterns below are anchored on purpose: at end of input the current character is
    // `undefined`, which RegExp#test coerces to the string "undefined". An unanchored
    // pattern would classify that as a letter.

    /** Whether `input` is exactly one character in U+0041 (A) to U+005A (Z). */
    private asciiUpperAlpha(input: string): boolean {
        return /^[\u0041-\u005A]$/.test(input);
    }

    /** Whether `input` is exactly one character in U+0061 (a) to U+007A (z). */
    private asciiLowerAlpha(input: string): boolean {
        return /^[\u0061-\u007A]$/.test(input);
    }

    /** Whether `input` is exactly one character in U+0030 (0) to U+0039 (9). */
    private asciiDigit(input: string): boolean {
        return /^[\u0030-\u0039]$/.test(input);
    }

    /** Lowercases ASCII upper alphas only; every other character is returned unchanged. */
    private toAsciiLower(character: string): string {
        return this.asciiUpperAlpha(character) ? character.toLowerCase() : character;
    }

    /** Un-consumes the current character and immediately handles it in `state`. */
    private reconsumeIn(state: State): void {
        this.pointer--;
        this.state = state;
        this.spin();
    }

    /** Reports a parse error on the console; tokenization continues afterwards. */
    private parseError(error: ParseError): void {
        console.error('Parse error: ' + error);
    }

    /**
     * Consumes and returns the next input character, `undefined` at end of input.
     * The pointer is advanced even at end of input so that `reconsumeIn` stays balanced.
     */
    private consumeNext(): string | undefined {
        this.currentInputCharacter = this.input[this.pointer];
        this.pointer++;
        return this.currentInputCharacter;
    }

    /** Returns the next input character without consuming it. */
    private next(): string | undefined {
        return this.input[this.pointer];
    }

    /** Whether the upcoming input starts with `input` (case-sensitive). */
    private matchNextFew(input: string): boolean {
        return this.input.slice(this.pointer, this.pointer + input.length) === input;
    }

    /** Whether the upcoming input starts with `input`, ignoring case. */
    private matchNextFewCaseInsensitive(input: string): boolean {
        return this.input.slice(this.pointer, this.pointer + input.length).toLowerCase() === input.toLowerCase();
    }

    /** Consumes `input.length` characters, asserting that they equal `input`. */
    private consumeNextFew(input: string): void {
        for (let i = 0; i < input.length; i++) {
            const consumed = this.consumeNext();

            console.assert(consumed === input[i], {
                message: `Tokenizer#consumeNextFew: Expected '${input[i]}' (${input} at ${i}), got ${consumed} instead`
            });
        }
    }

    /** Consumes `input.length` characters, asserting that they equal `input` ignoring case. */
    private consumeNextFewCaseInsensitive(input: string): void {
        for (let i = 0; i < input.length; i++) {
            const consumed = this.consumeNext()?.toLowerCase();

            console.assert(consumed === input[i].toLowerCase(), {
                message: `Tokenizer#consumeNextFewCaseInsensitive: Expected '${input[i].toLowerCase()}' (${input.toLowerCase()} at ${i}), got ${consumed} instead`
            });
        }
    }

    /** Appends `token` to the list of emitted tokens. */
    private emit(token: Token): void {
        this.tokens.push(token);
    }

    /** Emits a character token carrying `data`. */
    private emitCharacter(data: string): void {
        this.emit({ type: Type.Character, data });
    }

    /** Emits an end-of-file token. */
    private emitEndOfFile(): void {
        this.emit({ type: Type.EndOfFile });
    }

    /** Emits the current token, asserting that it has the given type. */
    private emitCurrentOfType(type: Type): void {
        console.assert(this.currentToken.type === type, {
            message: `Tokenizer#emitCurrentOfType: Expected '${type}', got '${this.currentToken.type}' instead`
        });

        this.tokens.push(this.currentToken);
    }

    /** Emits the current token, asserting that it has one of the two given types. */
    private emitCurrentOfEitherType(a: Type, b: Type): void {
        console.assert(this.currentToken.type === a || this.currentToken.type === b, {
            message: `Tokenizer#emitCurrentOfEitherType: Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`
        });

        this.tokens.push(this.currentToken);
    }

    /** Emits the current start or end tag token. */
    private emitCurrentTag(): void {
        this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
    }

    /** Returns the current token narrowed to `type`; a mismatch is only reported through console.assert. */
    private currentOfType<T extends Type>(type: T): Token & { type: T } {
        console.assert(this.currentToken.type === type, {
            message: `Tokenizer#currentOfType: Expected '${type}', got '${this.currentToken.type}' instead`
        });

        return this.currentToken as Token & { type: T };
    }

    /** Returns the current token narrowed to either `a` or `b`; a mismatch is only reported through console.assert. */
    private currentOfEitherType<T extends Type, U extends Type>(a: T, b: U): Token & { type: T | U } {
        console.assert(this.currentToken.type === a || this.currentToken.type === b, {
            message: `Tokenizer#currentOfEitherType: Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`
        });

        return this.currentToken as Token & { type: T | U };
    }

    /** Returns the current token as a start or end tag token. */
    private currentTag() {
        return this.currentOfEitherType(Type.StartTag, Type.EndTag);
    }

    /** Returns the attribute most recently appended to the current tag token. */
    private currentAttribute() {
        return this.currentTag().attributes.current;
    }

    /** Makes `token` the current token and returns it. */
    private create(token: Token): Token {
        return this.currentToken = token;
    }
}