nwex.de/html/tokenizer.ts

import { ParseError } from "./errors.js";
import { entities } from "./tokenizer/entities.js";
import { State } from "./tokenizer/state.js";
import { AttributeList, Token, Type } from "./tokenizer/token.js";

// FIXME: Replace console.assert calls with throwing errors
/**
 * Step-wise HTML tokenizer modelled on the WHATWG tokenization state machine.
 *
 * Each call to {@link Tokenizer.spin} runs the handler of the current state once (plus any
 * immediate "reconsume" steps it triggers) and appends every emitted token to
 * {@link Tokenizer.tokens}. Once the input is exhausted an `EndOfFile` token is emitted.
 *
 * Only a subset of the states is implemented; reaching any other state throws.
 */
export class Tokenizer {
    /** State whose handler the next call to `spin` will run. */
    private state: State = State.Data;
    /** State to resume once a character reference has been processed. */
    private returnState!: State;

    /** Scratch buffer holding the code points of the character reference currently being read. */
    private temporaryBuffer!: string;

    /** Token under construction (tag, comment or DOCTYPE) that has not been emitted yet. */
    private currentToken!: Token;
    /** Last value produced by `consumeNext`; at runtime this is `undefined` once the input is exhausted. */
    private currentInputCharacter!: string;

    /** Tokens emitted so far, in emission order. */
    public tokens: Array<Token> = new Array<Token>();
    /** Index into `input` of the next character to consume. */
    private pointer: number = 0;

    /**
     * @param input The complete markup to tokenize.
     */
    public constructor(private input: string) {
    }

    /**
     * Runs one step of the state machine for the current state.
     *
     * Emitted tokens are appended to `tokens`; parse errors are reported through `parseError`.
     *
     * @throws Error when the current state has no handler yet.
     */
    public spin(): void {
        switch (this.state) {
        case State.Data: {
            switch (this.consumeNext()) {
                case '\u0026':
                    this.returnState = State.Data;
                    this.state = State.CharacterReference;
                    break;
                case '\u003C': this.state = State.TagOpen; break;
                case '\u0000':
                    this.parseError('unexpected-null-character');
                    this.emit({ type: Type.Character, data: this.currentInputCharacter });
                    break;
                case undefined: this.emit({ type: Type.EndOfFile }); break;
                default: this.emit({ type: Type.Character, data: this.currentInputCharacter });
            }

            break;
        }
        // NOTE(review): this branch is labelled RCDATA but mirrors the specification's RAWTEXT state (no '&' handling and
        //               '<' leads to the RAWTEXT less-than sign state) - confirm which of the two was intended.
        case State.RCDATA: {
            switch (this.consumeNext()) {
                case '\u003C': this.state = State.RAWTEXTLessThan; break;
                case '\u0000': this.parseError('unexpected-null-character'); this.emit({ type: Type.Character, data: '\uFFFD' }); break;
                case undefined: this.emit({ type: Type.EndOfFile }); break;
                default: this.emit({ type: Type.Character, data: this.currentInputCharacter });
            }

            break;
        }
        case State.TagOpen: {
            switch (this.consumeNext()) {
                case '\u0021': this.state = State.MarkupDeclarationOpen; break;
                case '\u002F': this.state = State.EndTagOpen; break;
                case '\u003F':
                    this.parseError('unexpected-question-mark-instead-of-tag-name');
                    this.create({ type: Type.Comment, data: '' });
                    this.reconsumeIn(State.BogusComment);
                    break;
                case undefined:
                    this.parseError('eof-before-tag-name');
                    this.emit({ type: Type.Character, data: '\u003C' });
                    this.emit({ type: Type.EndOfFile });
                    break;
                default: {
                    if (this.asciiAlpha(this.currentInputCharacter)) {
                        this.create({ type: Type.StartTag, name: '', attributes: new AttributeList() });
                        this.reconsumeIn(State.TagName);
                        break;
                    }

                    // Not a tag after all: the '<' is plain text and the current character is handled as data.
                    this.parseError('invalid-first-character-of-tag-name');
                    this.emit({ type: Type.Character, data: '\u003C' });
                    this.reconsumeIn(State.Data);
                }
            }

            break;
        }
        case State.EndTagOpen: {
            switch (this.consumeNext()) {
                case '\u003E': this.parseError('missing-end-tag-name'); this.state = State.Data; break;
                case undefined:
                    this.parseError('eof-before-tag-name');
                    this.emit({ type: Type.Character, data: '\u003C' });
                    this.emit({ type: Type.Character, data: '\u002F' });
                    this.emit({ type: Type.EndOfFile });
                    break;
                default: {
                    if (this.asciiAlpha(this.currentInputCharacter)) {
                        this.create({ type: Type.EndTag, name: '', attributes: new AttributeList() });
                        this.reconsumeIn(State.TagName);
                        break;
                    }

                    this.parseError('invalid-first-character-of-tag-name');
                    this.create({ type: Type.Comment, data: '' });
                    this.reconsumeIn(State.BogusComment);
                }
            }

            break;
        }
        case State.MarkupDeclarationOpen: {
            if (this.matchNextFew('--')) {
                this.consumeNextFew('--');
                this.create({ type: Type.Comment, data: '' });
                this.state = State.CommentStart;
            } else if (this.matchNextFewCaseInsensitive('DOCTYPE')) {
                this.consumeNextFewCaseInsensitive('DOCTYPE');
                this.state = State.DOCTYPE;
            } else if (this.matchNextFew('[CDATA[')) {
                this.consumeNextFew('[CDATA[');
                // NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the CDATA section state does not
                //       exist and will not be started here.
                this.parseError('cdata-in-html-content');
                this.create({ type: Type.Comment, data: '[CDATA[' });
                this.state = State.BogusComment;
            } else {
                this.parseError('incorrectly-opened-comment');
                this.create({ type: Type.Comment, data: '' });
                this.state = State.BogusComment;
            }

            break;
        }
        case State.DOCTYPE: {
            switch (this.consumeNext()) {
                case '\u0009':
                case '\u000A':
                case '\u000C':
                case '\u0020': this.state = State.BeforeDOCTYPEName; break;
                case '\u003E': this.reconsumeIn(State.BeforeDOCTYPEName); break;
                case undefined:
                    this.parseError('eof-in-doctype');
                    this.emit({ type: Type.DOCTYPE, forceQuirks: true });
                    this.emit({ type: Type.EndOfFile });
                    break;
                default:
                    this.parseError('missing-whitespace-before-doctype-name');
                    this.reconsumeIn(State.BeforeDOCTYPEName);
            }

            break;
        }
        // NOTE(review): the specification also handles '>' in this state (missing-doctype-name parse error, force-quirks
        //               DOCTYPE); here '>' currently falls through to the default branch.
        case State.BeforeDOCTYPEName: {
            switch (this.consumeNext()) {
                case '\u0009':
                case '\u000A':
                case '\u000C':
                case '\u0020': break;
                case '\u0000':
                    this.parseError('unexpected-null-character');
                    this.create({ type: Type.DOCTYPE, name: '\uFFFD' });
                    this.state = State.DOCTYPEName;
                    break;
                case undefined:
                    this.parseError('eof-in-doctype');
                    this.emit({ type: Type.DOCTYPE, forceQuirks: true });
                    this.emit({ type: Type.EndOfFile });
                    break;
                default: {
                    if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                        this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter.toLowerCase()});
                        this.state = State.DOCTYPEName;
                        break;
                    }

                    // The first character of the name has been consumed, so the rest of it is read in the DOCTYPE name
                    // state (going back to the DOCTYPE state would start a fresh token for every character).
                    this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter });
                    this.state = State.DOCTYPEName;
                }
            }

            break;
        }
        case State.DOCTYPEName: {
            switch (this.consumeNext()) {
                case '\u0009':
                case '\u000A':
                case '\u000C':
                case '\u0020': this.state = State.AfterDOCTYPEName; break;
                case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.DOCTYPE); break;
                case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.DOCTYPE)!.name += '\uFFFD'; break;
                case undefined:
                    this.parseError('eof-in-doctype');
                    this.currentOfType(Type.DOCTYPE).forceQuirks = true;
                    this.emitCurrentOfType(Type.DOCTYPE);
                    this.emit({ type: Type.EndOfFile });
                    break;
                default: {
                    if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                        this.currentOfType(Type.DOCTYPE)!.name += this.currentInputCharacter.toLowerCase();
                        break;
                    }

                    this.currentOfType(Type.DOCTYPE)!.name += this.currentInputCharacter;
                }
            }

            break;
        }
        case State.TagName: {
            switch (this.consumeNext()) {
                case '\u0009':
                case '\u000A':
                case '\u000C':
                case '\u0020': this.state = State.BeforeAttributeName; break;
                case '\u002F': this.state = State.SelfClosingStartTag; break;
                case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
                case '\u0000':
                    this.parseError('unexpected-null-character');
                    this.currentOfEitherType(Type.StartTag, Type.EndTag).name += '\uFFFD';
                    break;
                case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
                default: {
                    // Tag names are stored lower-cased.
                    if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter.toLowerCase();
                        break;
                    }

                    this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter;
                }
            }

            break;
        }
        case State.BeforeAttributeName: {
            switch (this.consumeNext()) {
                case '\u0009':
                case '\u000A':
                case '\u000C':
                case '\u0020': break;
                case '\u002F':
                case '\u003E':
                case undefined: this.reconsumeIn(State.AfterAttributeName); break;
                case '\u003D': {
                    // A leading '=' becomes the first character of the attribute name.
                    this.parseError('unexpected-equals-sign-before-attribute-name');
                    this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: this.currentInputCharacter, value: '' });
                    this.state = State.AttributeName;
                    break;
                }
                default: {
                    this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
                    this.reconsumeIn(State.AttributeName);
                }
            }

            break;
        }
        case State.AttributeName: {
            switch (this.consumeNext()) {
                case '\u0009':
                case '\u000A':
                case '\u000C':
                case '\u0020':
                case '\u002F':
                case '\u003E':
                case undefined: this.reconsumeIn(State.AfterAttributeName); break;
                case '\u003D': this.state = State.BeforeAttributeValue; break;
                case '\u0000': this.parseError('unexpected-null-character');
                    this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += '\uFFFD';
                    break;
                case '\u0022':
                case '\u0027':
                case '\u003C':
                    this.parseError('unexpected-character-in-attribute-name');
                    this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
                    break;
                default: {
                    // Attribute names are stored lower-cased.
                    if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter.toLowerCase();
                        break;
                    }

                    this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
                }
            }

            break;
        }
        case State.AfterAttributeName: {
            switch (this.consumeNext()) {
                case '\u0009':
                case '\u000A':
                case '\u000C':
                case '\u0020': break;
                case '\u002F': this.state = State.SelfClosingStartTag; break;
                case '\u003D': this.state = State.BeforeAttributeValue; break;
                case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
                case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
                default:
                    this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
                    this.reconsumeIn(State.AttributeName);
                    break;
            }

            break;
        }
        case State.BeforeAttributeValue: {
            switch (this.consumeNext()) {
                case '\u0009':
                case '\u000A':
                case '\u000C':
                case '\u0020': break;
                case '\u0022': this.state = State.AttributeValueDouble; break;
                case '\u0027': this.state = State.AttributeValueSingle; break;
                case '\u003E':
                    this.parseError('missing-attribute-value');
                    this.state = State.Data;
                    this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
                    break;
                default:
                    this.reconsumeIn(State.AttributeValueUnquoted);
            }

            break;
        }
        case State.AttributeValueDouble: {
            switch (this.consumeNext()) {
                case '\u0022': this.state = State.AfterAttributeValue; break;
                case '\u0026': this.returnState = State.AttributeValueDouble; this.state = State.CharacterReference; break;
                case '\u0000':
                    this.parseError('unexpected-null-character');
                    this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
                    break;
                case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
                default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
            }

            break;
        }
        case State.AttributeValueSingle: {
            switch (this.consumeNext()) {
                case '\u0027': this.state = State.AfterAttributeValue; break;
                case '\u0026': this.returnState = State.AttributeValueSingle; this.state = State.CharacterReference; break;
                case '\u0000':
                    this.parseError('unexpected-null-character');
                    this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
                    break;
                case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
                default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
            }

            break;
        }
        case State.AttributeValueUnquoted: {
            switch (this.consumeNext()) {
                case '\u0009':
                case '\u000A':
                case '\u000C':
                case '\u0020': this.state = State.BeforeAttributeName; break;
                case '\u0026': this.returnState = State.AttributeValueUnquoted; this.state = State.CharacterReference; break;
                case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
                case '\u0000':
                    this.parseError('unexpected-null-character');
                    this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
                    break;
                case '\u0022':
                case '\u0027':
                case '\u003C':
                case '\u003D':
                case '\u0060':
                    this.parseError('unexpected-character-in-unquoted-attribute-value');
                    this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                    break;
                case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
                default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
            }

            break;
        }
        case State.AfterAttributeValue: {
            switch (this.consumeNext()) {
                case '\u0009':
                case '\u000A':
                case '\u000C':
                case '\u0020': this.state = State.BeforeAttributeName; break;
                case '\u002F': this.state = State.SelfClosingStartTag; break;
                case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
                case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
                default: this.parseError('missing-whitespace-between-attributes'); this.reconsumeIn(State.BeforeAttributeName);
            }

            break;
        }
        case State.CommentStart: {
            switch (this.consumeNext()) {
                case '\u002D': this.state = State.CommentStartDash; break;
                case '\u003E': this.parseError('abrupt-closing-of-empty-comment'); this.state = State.Data; this.emitCurrentOfType(Type.Comment); break;
                default: this.reconsumeIn(State.Comment);
            }

            break;
        }
        // FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state (adding **current** in some places)
        case State.Comment: {
            switch (this.consumeNext()) {
                case '\u003C': this.currentOfType(Type.Comment).data += this.currentInputCharacter; this.state = State.CommentLessThanSign; break;
                case '\u002D': this.state = State.CommentEndDash; break;
                case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.Comment).data += '\uFFFD'; break;
                case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
                default: this.currentOfType(Type.Comment).data += this.currentInputCharacter;
            }

            break;
        }
        case State.CommentEndDash: {
            switch (this.consumeNext()) {
                case '\u002D': this.state = State.CommentEnd; break;
                case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
                // The single '-' seen before was ordinary comment text after all.
                default: this.currentOfType(Type.Comment).data += '\u002D'; this.reconsumeIn(State.Comment);
            }

            break;
        }
        // Same as above fixme https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
        case State.CommentEnd: {
            switch (this.consumeNext()) {
                case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.Comment); break;
                case '\u0021': this.state = State.CommentEndBang; break;
                case '\u002D': this.currentOfType(Type.Comment).data += '\u002D'; break;
                case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
                // The '--' seen before did not close the comment, so it is part of its text.
                default: this.currentOfType(Type.Comment).data += '\u002D\u002D'; this.reconsumeIn(State.Comment);
            }

            break;
        }
        // Same as above https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
        case State.BogusComment: {
            switch (this.consumeNext()) {
                case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.Comment); break;
                case undefined: this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
                case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.Comment).data += '\uFFFD'; break;
                default: this.currentOfType(Type.Comment).data += this.currentInputCharacter;
            }

            break;
        }
        case State.CharacterReference: {
            // The '&' that led here has already been consumed by the previous state.
            this.temporaryBuffer = '';
            this.temporaryBuffer += '\u0026';

            switch (this.consumeNext()) {
                case '\u0023': this.temporaryBuffer += this.currentInputCharacter; this.state = State.NumericCharacterReference; break;
                default: {
                    if (this.asciiAlphanumeric(this.currentInputCharacter)) {
                        this.reconsumeIn(State.NamedCharacterReference);
                        break;
                    }

                    this.flushCodePointsConsumedAsCharacterReference();
                    this.reconsumeIn(this.returnState);
                }
            }

            break;
        }
        case State.NamedCharacterReference: {
            // The specification asks for the longest entity name that matches the upcoming input, so every entry is
            // inspected instead of stopping at the first hit (otherwise e.g. 'not' could shadow 'notin;').
            // The entity table is declared elsewhere; keys are accepted both with and without a leading '&' (the
            // WHATWG entities.json uses one), because the '&' itself has already been consumed at this point.
            let longest: { key: string, name: string } | undefined;

            for (const entry in entities) {
                const name = entry[0] === '\u0026' ? entry.slice(1) : entry;

                if (name.length > (longest?.name.length ?? 0) && this.matchNextFew(name))
                    longest = { key: entry, name };
            }

            if (longest === undefined) {
                this.flushCodePointsConsumedAsCharacterReference();
                this.state = State.AmbiguousAmpersand;
                break;
            }

            this.consumeNextFew(longest.name);
            this.temporaryBuffer += longest.name;

            const terminated = longest.name[longest.name.length - 1] === '\u003B';

            // Historical quirk: inside an attribute value, an unterminated reference followed by '=' or an
            // alphanumeric is kept as literal text.
            if (this.consumedAsPartOfAnAttribute() && !terminated && (this.next() === '\u003D' || this.asciiAlphanumeric(this.next() ?? ''))) {
                this.flushCodePointsConsumedAsCharacterReference();
                this.state = this.returnState;
                break;
            }

            if (!terminated)
                this.parseError('missing-semicolon-after-character-reference');

            // Replace the raw reference text with the characters it stands for.
            this.temporaryBuffer = '';
            this.temporaryBuffer += entities[longest.key].characters;
            this.flushCodePointsConsumedAsCharacterReference();
            this.state = this.returnState;

            break;
        }
        case State.AmbiguousAmpersand: {
            switch (this.consumeNext()) {
                case '\u003B': this.parseError('unknown-named-character-reference'); this.reconsumeIn(this.returnState); break;
                default: {
                    if (this.asciiAlphanumeric(this.currentInputCharacter)) {
                        if (this.consumedAsPartOfAnAttribute()) {
                            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                        } else {
                            this.emit({ type: Type.Character, data: this.currentInputCharacter });
                        }

                        break;
                    }

                    this.reconsumeIn(this.returnState);
                }
            }

            break;
        }
        default: throw new Error(`FIXME (Tokenizer#spin, Unimplemented state '${this.state}')`);
        }
    }

    /**
     * Hands the contents of the temporary buffer to whoever requested the character reference: appended to the
     * current attribute value when inside an attribute, otherwise emitted as one character token per code point.
     */
    private flushCodePointsConsumedAsCharacterReference(): void {
        if (this.consumedAsPartOfAnAttribute())  {
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.temporaryBuffer;
            return;
        }

        for (const codePoint of this.temporaryBuffer)
            this.emit({ type: Type.Character, data: codePoint });
    }

    /**
     * @returns `true` when the return state is one of the three attribute value states.
     */
    private consumedAsPartOfAnAttribute(): boolean {
        return this.returnState === State.AttributeValueDouble || this.returnState === State.AttributeValueSingle || this.returnState === State.AttributeValueUnquoted;
    }

    /**
     * @returns `true` when `input` is a single ASCII letter or digit.
     */
    private asciiAlphanumeric(input: string): boolean {
        return this.asciiAlpha(input) || this.asciiDigit(input);
    }

    /**
     * @returns `true` when `input` is a single ASCII letter of either case.
     */
    private asciiAlpha(input: string): boolean {
        return this.asciiUpperAlpha(input) || this.asciiLowerAlpha(input);
    }

    /**
     * @returns `true` when `input` is exactly one character in the range U+0041 (A) to U+005A (Z).
     */
    private asciiUpperAlpha(input: string): boolean {
        // Anchored so that anything other than a single character (including the end-of-input marker) is rejected.
        return /^[\u0041-\u005A]$/.test(input);
    }

    /**
     * @returns `true` when `input` is exactly one character in the range U+0061 (a) to U+007A (z).
     */
    private asciiLowerAlpha(input: string): boolean {
        return /^[\u0061-\u007A]$/.test(input);
    }

    /**
     * @returns `true` when `input` is exactly one character in the range U+0030 (0) to U+0039 (9).
     */
    private asciiDigit(input: string): boolean {
        return /^[\u0030-\u0039]$/.test(input);
    }

    /**
     * Steps back one character, switches to `state` and immediately runs it, so that the character just consumed is
     * processed again by the new state.
     */
    private reconsumeIn(state: State): void {
        this.pointer--;
        this.state = state;
        this.spin();
    }

    /**
     * Reports a parse error; currently this only logs to the console and tokenization continues.
     */
    private parseError(error: ParseError): void {
        console.error('Parse error: ' + error);
    }

    /**
     * Consumes the next input character and records it as the current input character.
     *
     * @returns The consumed character, or `undefined` when the input is exhausted (the pointer still advances so
     *          that a later reconsume step stays balanced).
     */
    private consumeNext(): string | undefined {
        this.currentInputCharacter = this.input[this.pointer];
        this.pointer++;

        return this.currentInputCharacter;
    }

    /**
     * @returns The next input character without consuming it, or `undefined` at the end of the input.
     */
    private next(): string | undefined {
        return this.input[this.pointer];
    }

    /**
     * @returns `true` when the unconsumed input starts with `input` (exact, case-sensitive comparison).
     */
    private matchNextFew(input: string): boolean {
        return this.input.startsWith(input, this.pointer);
    }

    /**
     * @returns `true` when the unconsumed input starts with `input`, comparing lower-cased forms.
     */
    private matchNextFewCaseInsensitive(input: string): boolean {
        return this.input.slice(this.pointer, this.pointer + input.length).toLowerCase() === input.toLowerCase();
    }

    /**
     * Consumes `input.length` characters, asserting that each one equals the corresponding character of `input`.
     */
    private consumeNextFew(input: string): void {
        for (let i = 0; i < input.length; i++) {
            const consumed = this.consumeNext();

            console.assert(consumed === input[i], {
                message: `Tokenizer#consumeNextFew: Expected '${input[i]}' (${input} at ${i}), got ${consumed} instead`
            });
        }
    }

    /**
     * Consumes `input.length` characters, asserting that each one equals the corresponding character of `input`
     * when both are lower-cased.
     */
    private consumeNextFewCaseInsensitive(input: string): void {
        for (let i = 0; i < input.length; i++) {
            const consumed = this.consumeNext()?.toLowerCase();

            console.assert(consumed === input[i].toLowerCase(), {
                message: `Tokenizer#consumeNextFewCaseInsensitive: Expected '${input[i].toLowerCase()}' (${input.toLowerCase()} at ${i}), got ${consumed} instead`
            });
        }
    }

    /**
     * Appends `token` to the list of emitted tokens.
     */
    private emit(token: Token): void {
        this.tokens.push(token);
    }

    /**
     * Emits the current token after asserting that it has the given type.
     */
    private emitCurrentOfType(type: Type): void {
        console.assert(this.currentToken.type === type, {
            message: `Tokenizer#emitCurrentOfType: Expected '${type}', got '${this.currentToken.type}' instead`
        });

        this.tokens.push(this.currentToken);
    }

    /**
     * Emits the current token after asserting that it has one of the two given types.
     */
    private emitCurrentOfEitherType(a: Type, b: Type): void {
        console.assert(this.currentToken.type === a || this.currentToken.type === b, {
            message: `Tokenizer#emitCurrentOfEitherType: Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`
        });

        this.tokens.push(this.currentToken);
    }

    /**
     * @returns The current token, typed as a token of `type`. A mismatch is only reported through `console.assert`.
     */
    private currentOfType<T extends Type>(type: T): Token & { type: T } {
        console.assert(this.currentToken.type === type, {
            message: `Tokenizer#currentOfType: Expected '${type}', got '${this.currentToken.type}' instead`
        });

        return this.currentToken as Token & { type: T };
    }

    /**
     * @returns The current token, typed as a token of type `a` or `b`. A mismatch is only reported through
     *          `console.assert`.
     */
    private currentOfEitherType<T extends Type, U extends Type>(a: T, b: U): Token & { type: T | U } {
        console.assert(this.currentToken.type === a || this.currentToken.type === b, {
            message: `Tokenizer#currentOfEitherType: Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`
        });

        return this.currentToken as Token & { type: T | U };
    }

    /**
     * Makes `token` the current token (the one being built) and returns it.
     */
    private create(token: Token): Token {
        return this.currentToken = token;
    }
}