import { TODO, VERIFY, VERIFY_NOT_REACHED } from "../util/assertions.js";
import { Constructor } from "../util/guards.js";
import { ParseError } from "./errors.js";
import { entities } from "./tokenizer/entities.js";
import { State } from "./tokenizer/state.js";
import { Attribute, CharacterToken, CommentToken, DOCTYPEToken, EndOfFileToken, EndTagToken, Position, StartTagToken, Token } from "./tokenizer/token.js";

/**
 * State-machine tokenizer for HTML input.
 *
 * Each call to {@link Tokenizer.spin} executes one step of the current state:
 * it consumes input, possibly changes state, and appends any emitted tokens to
 * {@link Tokenizer.tokens}. States that are not handled in `spin` end up in `TODO`.
 */
export class Tokenizer {

    private state: State = State.Data;
    private returnState!: State;
    private temporaryBuffer!: string;
    private currentToken!: Token;
    private currentInputCharacter!: string;
    private currentPosition: Position = { line: 0, column: 0, index: 0 };
    // Copy of `currentPosition` taken right before the most recent `consumeNext`,
    // so that `reconsumeIn` can undo the position advance together with the pointer.
    private previousPosition: Position = { line: 0, column: 0, index: 0 };

    /** Tokens emitted so far, in emission order. */
    public tokens: Array<Token> = new Array<Token>();

    private pointer: number = 0;

    public constructor(private input: string) {
    }

    /**
     * Runs one step of the state machine for the current state.
     *
     * A step that reconsumes a character re-enters `spin` (through `reconsumeIn`)
     * before returning, so one call may execute several states.
     */
    public spin(): void {
        switch (this.state) {
            case State.Data: {
                switch (this.consumeNext()) {
                    case '\u0026':
                        this.returnState = State.Data;
                        this.state = State.CharacterReference;
                        break;
                    case '\u003C':
                        this.state = State.TagOpen;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
                        break;
                    case undefined:
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
                }
                break;
            }
            case State.RCDATA: {
                switch (this.consumeNext()) {
                    case '\u003C':
                        // NOTE(review): the RCDATA state should presumably go to an RCDATA
                        // less-than sign state rather than the RAWTEXT one; the matching
                        // `State` member is not visible from this file - confirm and adjust.
                        this.state = State.RAWTEXTLessThan;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.emit(CharacterToken.createReplacementCharacter().at(this.currentPosition));
                        break;
                    case undefined:
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
                }
                break;
            }
            case State.TagOpen: {
                switch (this.consumeNext()) {
                    case '\u0021':
                        this.state = State.MarkupDeclarationOpen;
                        break;
                    case '\u002F':
                        this.state = State.EndTagOpen;
                        break;
                    case '\u003F':
                        this.parseError('unexpected-question-mark-instead-of-tag-name');
                        this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
                        this.reconsumeIn(State.BogusComment);
                        break;
                    case undefined:
                        this.parseError('eof-before-tag-name');
                        this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
                        this.emit(EndOfFileToken.create());
                        break;
                    default: {
                        if (this.asciiAlpha(this.currentInputCharacter)) {
                            this.create(StartTagToken.createEmpty().startingAt(this.currentPosition));
                            this.reconsumeIn(State.TagName);
                            break;
                        }

                        this.parseError('invalid-first-character-of-tag-name');
                        this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
                        this.reconsumeIn(State.Data);
                    }
                }
                break;
            }
            case State.EndTagOpen: {
                switch (this.consumeNext()) {
                    case '\u003E':
                        this.parseError('missing-end-tag-name');
                        this.state = State.Data;
                        break;
                    case undefined:
                        this.parseError('eof-before-tag-name');
                        this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
                        this.emit(CharacterToken.createWith('\u002F').at(this.currentPosition));
                        this.emit(EndOfFileToken.create());
                        break;
                    default: {
                        if (this.asciiAlpha(this.currentInputCharacter)) {
                            this.create(EndTagToken.createEmpty().startingAt(this.currentPosition));
                            this.reconsumeIn(State.TagName);
                            break;
                        }

                        this.parseError('invalid-first-character-of-tag-name');
                        this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
                        this.reconsumeIn(State.BogusComment);
                    }
                }
                break;
            }
            case State.MarkupDeclarationOpen: {
                if (this.matchNextFew('--')) {
                    this.consumeNextFew('--');
                    this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
                    this.state = State.CommentStart;
                } else if (this.matchNextFewCaseInsensitive('DOCTYPE')) {
                    this.consumeNextFewCaseInsensitive('DOCTYPE');
                    this.state = State.DOCTYPE;
                } else if (this.matchNextFew('[CDATA[')) {
                    this.consumeNextFew('[CDATA[');
                    // NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the
                    //       CDATA section state does not exist and will not be started here.
                    this.parseError('cdata-in-html-content');
                    this.create(CommentToken.createWith('[CDATA[').startingAt(this.currentPosition));
                    this.state = State.BogusComment;
                } else {
                    this.parseError('incorrectly-opened-comment');
                    this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
                    this.state = State.BogusComment;
                }
                break;
            }
            case State.DOCTYPE: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.BeforeDOCTYPEName;
                        break;
                    case '\u003E':
                        this.reconsumeIn(State.BeforeDOCTYPEName);
                        break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition));
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.parseError('missing-whitespace-before-doctype-name');
                        this.reconsumeIn(State.BeforeDOCTYPEName);
                }
                break;
            }
            case State.BeforeDOCTYPEName: {
                // NOTE(review): there is no dedicated branch for '>' here (the HTML Standard
                // reports 'missing-doctype-name' and emits a force-quirks DOCTYPE), so '>'
                // currently becomes the first character of the name. Not added because the
                // members of `ParseError` are not visible from this file - confirm and add.
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.create(DOCTYPEToken.createWithName('\uFFFD').startingAt(this.currentPosition));
                        this.state = State.DOCTYPEName;
                        break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition));
                        this.emit(EndOfFileToken.create());
                        break;
                    default: {
                        this.create(DOCTYPEToken.createWithName(this.asciiLowercased(this.currentInputCharacter)).startingAt(this.currentPosition));
                        // The first name character has been stored, so the remaining characters
                        // belong to the DOCTYPE name state (previously this went back to DOCTYPE).
                        this.state = State.DOCTYPEName;
                    }
                }
                break;
            }
            case State.DOCTYPEName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.AfterDOCTYPEName;
                        break;
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentOfType(DOCTYPEToken);
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfType(DOCTYPEToken).appendReplacementCharacterToName();
                        break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.currentOfType(DOCTYPEToken).forceQuirks = true;
                        this.emitCurrentOfType(DOCTYPEToken);
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.currentOfType(DOCTYPEToken).appendToName(this.asciiLowercased(this.currentInputCharacter));
                }
                break;
            }
            case State.TagName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.BeforeAttributeName;
                        break;
                    case '\u002F':
                        this.state = State.SelfClosingStartTag;
                        break;
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(StartTagToken, EndTagToken).appendReplacementCharacterToName();
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.asciiLowercased(this.currentInputCharacter));
                }
                break;
            }
            case State.BeforeAttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        break;
                    case '\u002F':
                    case '\u003E':
                    case undefined:
                        this.reconsumeIn(State.AfterAttributeName);
                        break;
                    case '\u003D': {
                        this.parseError('unexpected-equals-sign-before-attribute-name');
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyValue(this.currentInputCharacter));
                        this.state = State.AttributeName;
                        break;
                    }
                    default: {
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue());
                        this.reconsumeIn(State.AttributeName);
                    }
                }
                break;
            }
            case State.AttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                    case '\u002F':
                    case '\u003E':
                    case undefined:
                        this.reconsumeIn(State.AfterAttributeName);
                        break;
                    case '\u003D':
                        this.state = State.BeforeAttributeValue;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToName();
                        break;
                    case '\u0022':
                    case '\u0027':
                    case '\u003C':
                        this.parseError('unexpected-character-in-attribute-name');
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter);
                        break;
                    default:
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.asciiLowercased(this.currentInputCharacter));
                }
                break;
            }
            case State.AfterAttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        break;
                    case '\u002F':
                        this.state = State.SelfClosingStartTag;
                        break;
                    case '\u003D':
                        this.state = State.BeforeAttributeValue;
                        break;
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue());
                        this.reconsumeIn(State.AttributeName);
                        break;
                }
                break;
            }
            case State.BeforeAttributeValue: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        break;
                    case '\u0022':
                        this.state = State.AttributeValueDouble;
                        break;
                    case '\u0027':
                        this.state = State.AttributeValueSingle;
                        break;
                    case '\u003E':
                        this.parseError('missing-attribute-value');
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
                        break;
                    default:
                        this.reconsumeIn(State.AttributeValueUnquoted);
                }
                break;
            }
            case State.AttributeValueDouble: {
                switch (this.consumeNext()) {
                    case '\u0022':
                        this.state = State.AfterAttributeValue;
                        break;
                    case '\u0026':
                        this.returnState = State.AttributeValueDouble;
                        this.state = State.CharacterReference;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
                }
                break;
            }
            case State.AttributeValueSingle: {
                switch (this.consumeNext()) {
                    case '\u0027':
                        this.state = State.AfterAttributeValue;
                        break;
                    case '\u0026':
                        this.returnState = State.AttributeValueSingle;
                        this.state = State.CharacterReference;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
                }
                break;
            }
            case State.AttributeValueUnquoted: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.BeforeAttributeName;
                        break;
                    case '\u0026':
                        this.returnState = State.AttributeValueUnquoted;
                        this.state = State.CharacterReference;
                        break;
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
                        break;
                    case '\u0022':
                    case '\u0027':
                    case '\u003C':
                    case '\u003D':
                    case '\u0060':
                        this.parseError('unexpected-character-in-unquoted-attribute-value');
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
                }
                break;
            }
            case State.AfterAttributeValue: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                        this.state = State.BeforeAttributeName;
                        break;
                    case '\u002F':
                        this.state = State.SelfClosingStartTag;
                        break;
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
                        break;
                    case undefined:
                        this.parseError('eof-in-tag');
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.parseError('missing-whitespace-between-attributes');
                        this.reconsumeIn(State.BeforeAttributeName);
                }
                break;
            }
            case State.CommentStart: {
                switch (this.consumeNext()) {
                    case '\u002D':
                        this.state = State.CommentStartDash;
                        break;
                    case '\u003E':
                        this.parseError('abrupt-closing-of-empty-comment');
                        this.state = State.Data;
                        this.emitCurrentOfType(CommentToken);
                        break;
                    default:
                        this.reconsumeIn(State.Comment);
                }
                break;
            }
            // FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state
            //        (adding **current** in some places)
            case State.Comment: {
                switch (this.consumeNext()) {
                    case '\u003C':
                        this.currentOfType(CommentToken).append(this.currentInputCharacter);
                        this.state = State.CommentLessThanSign;
                        break;
                    case '\u002D':
                        this.state = State.CommentEndDash;
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfType(CommentToken).appendReplacementCharacter();
                        break;
                    case undefined:
                        this.parseError('eof-in-comment');
                        this.emitCurrentOfType(CommentToken);
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.currentOfType(CommentToken).append(this.currentInputCharacter);
                }
                break;
            }
            case State.CommentEndDash: {
                switch (this.consumeNext()) {
                    case '\u002D':
                        this.state = State.CommentEnd;
                        break;
                    case undefined:
                        this.parseError('eof-in-comment');
                        this.emitCurrentOfType(CommentToken);
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.currentOfType(CommentToken).append('\u002D');
                        this.reconsumeIn(State.Comment);
                }
                break;
            }
            // FIXME: Same possible improvement as for the comment state, see
            //        https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
            case State.CommentEnd: {
                switch (this.consumeNext()) {
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentOfType(CommentToken);
                        break;
                    case '\u0021':
                        this.state = State.CommentEndBang;
                        break;
                    case '\u002D':
                        this.currentOfType(CommentToken).append('\u002D');
                        break;
                    case undefined:
                        this.parseError('eof-in-comment');
                        this.emitCurrentOfType(CommentToken);
                        this.emit(EndOfFileToken.create());
                        break;
                    default:
                        this.currentOfType(CommentToken).append('\u002D\u002D');
                        this.reconsumeIn(State.Comment);
                }
                break;
            }
            // FIXME: Same possible improvement as for the comment state, see
            //        https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
            case State.BogusComment: {
                switch (this.consumeNext()) {
                    case '\u003E':
                        this.state = State.Data;
                        this.emitCurrentOfType(CommentToken);
                        break;
                    case undefined:
                        this.emitCurrentOfType(CommentToken);
                        this.emit(EndOfFileToken.create());
                        break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfType(CommentToken).appendReplacementCharacter();
                        break;
                    default:
                        this.currentOfType(CommentToken).append(this.currentInputCharacter);
                }
                break;
            }
            case State.CharacterReference: {
                this.temporaryBuffer = '';
                this.temporaryBuffer += '\u0026';
                switch (this.consumeNext()) {
                    case '\u0023':
                        this.temporaryBuffer += this.currentInputCharacter;
                        this.state = State.NumericCharacterReference;
                        break;
                    default: {
                        if (this.asciiAlphanumeric(this.currentInputCharacter)) {
                            this.reconsumeIn(State.NamedCharacterReference);
                            break;
                        }

                        this.flushCodePointsConsumedAsCharacterReference();
                        this.reconsumeIn(this.returnState);
                    }
                }
                break;
            }
            case State.NamedCharacterReference: {
                // Pick the LONGEST key of `entities` that matches the upcoming input; taking
                // the first match in iteration order would let a short legacy name (without
                // ';') shadow a longer name sharing the same prefix.
                let longestMatch: string | undefined;
                for (const entry in entities) {
                    if (!this.matchNextFew(entry)) continue;
                    if (longestMatch === undefined || entry.length > longestMatch.length) longestMatch = entry;
                }

                if (longestMatch === undefined) {
                    this.flushCodePointsConsumedAsCharacterReference();
                    this.state = State.AmbiguousAmpersand;
                    break;
                }

                this.consumeNextFew(longestMatch);
                this.temporaryBuffer += longestMatch;

                const endsWithSemicolon = longestMatch[longestMatch.length - 1] === '\u003B';
                const followedByEqualsOrAlphanumeric = this.next() === '\u003D' || this.asciiAlphanumeric(this.next() ?? '');
                if (this.consumedAsPartOfAnAttribute() && !endsWithSemicolon && followedByEqualsOrAlphanumeric) {
                    // Inside an attribute value the matched text is kept literally.
                    this.flushCodePointsConsumedAsCharacterReference();
                    this.state = this.returnState;
                    break;
                }

                if (!endsWithSemicolon) this.parseError('missing-semicolon-after-character-reference');

                this.temporaryBuffer = '';
                this.temporaryBuffer += entities[longestMatch].characters;
                this.flushCodePointsConsumedAsCharacterReference();
                this.state = this.returnState;
                break;
            }
            case State.AmbiguousAmpersand: {
                switch (this.consumeNext()) {
                    case '\u003B':
                        this.parseError('unknown-named-character-reference');
                        this.reconsumeIn(this.returnState);
                        break;
                    default: {
                        if (this.asciiAlphanumeric(this.currentInputCharacter)) {
                            if (this.consumedAsPartOfAnAttribute()) {
                                this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
                            } else {
                                this.emit(CharacterToken.createWith(this.currentInputCharacter));
                            }
                            break;
                        }

                        this.reconsumeIn(this.returnState);
                    }
                }
                break;
            }
            default:
                TODO(`Unimplemented state '${this.state}'`);
        }
    }

    /**
     * Flushes the temporary buffer: appended to the current attribute's value when the
     * character reference started inside an attribute value, otherwise emitted as one
     * character token per code point.
     */
    private flushCodePointsConsumedAsCharacterReference(): void {
        if (this.consumedAsPartOfAnAttribute()) {
            this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.temporaryBuffer);
            return;
        }

        for (const codePoint of this.temporaryBuffer) this.emit(CharacterToken.createWith(codePoint));
    }

    /** @returns whether the return state is one of the three attribute value states. */
    private consumedAsPartOfAnAttribute(): boolean {
        return this.returnState === State.AttributeValueDouble
            || this.returnState === State.AttributeValueSingle
            || this.returnState === State.AttributeValueUnquoted;
    }

    /** @returns whether `input` is a single ASCII letter or digit. */
    private asciiAlphanumeric(input: string): boolean {
        return this.asciiAlpha(input) || this.asciiDigit(input);
    }

    /** @returns whether `input` is a single ASCII letter. */
    private asciiAlpha(input: string): boolean {
        return this.asciiUpperAlpha(input) || this.asciiLowerAlpha(input);
    }

    // The patterns below are anchored on purpose: at end of input the current input
    // character is `undefined` at runtime, which `RegExp.prototype.test` coerces to the
    // string "undefined" - an unanchored character class would match its letters.

    /** @returns whether `input` is exactly one character in the range A-Z. */
    private asciiUpperAlpha(input: string): boolean {
        return /^[\u0041-\u005A]$/.test(input);
    }

    /** @returns whether `input` is exactly one character in the range a-z. */
    private asciiLowerAlpha(input: string): boolean {
        return /^[\u0061-\u007A]$/.test(input);
    }

    /** @returns whether `input` is exactly one character in the range 0-9. */
    private asciiDigit(input: string): boolean {
        return /^[\u0030-\u0039]$/.test(input);
    }

    /** @returns `input` lowercased if it is an ASCII upper alpha, otherwise `input` unchanged. */
    private asciiLowercased(input: string): string {
        return this.asciiUpperAlpha(input) ? input.toLowerCase() : input;
    }

    /**
     * Un-consumes the current input character, switches to `state` and immediately runs it.
     *
     * The position is rewound together with the pointer, so the reconsumed character is
     * only counted once in column/index/line.
     */
    private reconsumeIn(state: State): void {
        this.pointer--;
        // Mutate in place rather than swapping the object, mirroring how consumeNext updates it.
        this.currentPosition.line = this.previousPosition.line;
        this.currentPosition.column = this.previousPosition.column;
        this.currentPosition.index = this.previousPosition.index;
        this.state = state;
        this.spin();
    }

    /** Reports a parse error by logging it to the console. */
    private parseError(error: ParseError): void {
        console.error('Parse error: ' + error);
    }

    /**
     * Consumes the next input character, makes it the current input character and advances
     * the position (a line feed resets the column and increments the line).
     *
     * @returns the consumed character, or `undefined` once the input is exhausted.
     */
    private consumeNext(): string | undefined {
        const { line, column, index } = this.currentPosition;
        this.previousPosition = { line, column, index };

        this.currentInputCharacter = this.input[this.pointer];
        this.pointer++;
        this.currentPosition.column++;
        this.currentPosition.index++;
        if (this.currentInputCharacter === '\n') {
            this.currentPosition.column = 0;
            this.currentPosition.line++;
        }
        return this.currentInputCharacter;
    }

    /** @returns the next input character without consuming it, or `undefined` at end of input. */
    private next(): string | undefined {
        return this.input[this.pointer];
    }

    /** @returns whether the upcoming input starts with `input` (case-sensitive). */
    private matchNextFew(input: string): boolean {
        return this.input.slice(this.pointer, this.pointer + input.length) === input;
    }

    /** @returns whether the upcoming input starts with `input`, comparing lowercased text. */
    private matchNextFewCaseInsensitive(input: string): boolean {
        return this.input.slice(this.pointer, this.pointer + input.length).toLowerCase() === input.toLowerCase();
    }

    /** Consumes `input.length` characters, verifying each one equals the matching character of `input`. */
    private consumeNextFew(input: string): void {
        for (let i = 0; i < input.length; i++) {
            const consumed = this.consumeNext();
            VERIFY(consumed === input[i], `Expected '${input[i]}' (${input} at ${i}), got ${consumed} instead`);
        }
    }

    /** Like `consumeNextFew`, but compares lowercased characters. */
    private consumeNextFewCaseInsensitive(input: string): void {
        for (let i = 0; i < input.length; i++) {
            const consumed = this.consumeNext()?.toLowerCase();
            VERIFY(consumed === input[i].toLowerCase(), `Expected '${input[i].toLowerCase()}' (${input.toLowerCase()} at ${i}), got ${consumed} instead`);
        }
    }

    /** Completes the token's range if necessary and appends the token to `tokens`. */
    private emit(token: Token): void {
        this.populateRangeOnEmit(token);
        this.tokens.push(token);
    }

    /** Emits the current token after verifying it is an instance of `type`. */
    private emitCurrentOfType<T extends Token>(type: Constructor<T>): void {
        VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`);
        this.emit(this.currentToken);
    }

    /** Emits the current token after verifying it is an instance of `a` or `b`. */
    private emitCurrentOfEitherType<T extends Token, U extends Token>(a: Constructor<T>, b: Constructor<U>): void {
        VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`);
        this.emit(this.currentToken);
    }

    /**
     * Returns the current token after verifying it is an instance of `type`.
     *
     * This accessor does not touch the token's range: the range end is recorded when the
     * token is emitted, not when it is first accessed.
     */
    private currentOfType<T extends Token>(type: Constructor<T>): T {
        VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`);
        return this.currentToken;
    }

    /**
     * Returns the current token after verifying it is an instance of `a` or `b`.
     *
     * Like `currentOfType`, this does not touch the token's range.
     */
    private currentOfEitherType<T extends Token, U extends Token>(a: Constructor<T>, b: Constructor<U>): T | U {
        VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`);
        return this.currentToken;
    }

    /**
     * Fills in whatever part of the token's range is still missing, using the current
     * position: both ends when neither is set, only the end when just the start is set.
     * A token with an end but no start is treated as unreachable.
     */
    private populateRangeOnEmit(token: Token): void {
        if (token.range.start === undefined && token.range.end === undefined) token.at(this.currentPosition);
        if (token.range.start !== undefined && token.range.end === undefined) token.endingAt(this.currentPosition);
        if (token.range.start === undefined && token.range.end !== undefined) VERIFY_NOT_REACHED();
    }

    /** Makes `token` the current token, giving it the current position as start if it has none. */
    private create(token: Token): Token {
        if (token.range.start === undefined) token.startingAt(this.currentPosition);

        return this.currentToken = token;
    }

}