2021-10-24 23:06:35 +02:00
|
|
|
import { TODO, VERIFY } from "../util/assertions.js";
|
2021-10-24 22:36:38 +02:00
|
|
|
import { ParseError } from "./errors.js";
|
|
|
|
import { entities } from "./tokenizer/entities.js";
|
|
|
|
import { State } from "./tokenizer/state.js";
|
|
|
|
import { AttributeList, Token, Type } from "./tokenizer/token.js";
|
|
|
|
|
|
|
|
/**
 * Tokenizer for HTML input, structured as the state machine described in the
 * WHATWG HTML Standard ("Tokenization").
 *
 * Each call to {@link Tokenizer.spin} performs one step of the current state
 * (plus any immediate "reconsume" steps) and appends emitted tokens to
 * {@link Tokenizer.tokens}. Callers are expected to call `spin()` repeatedly,
 * typically until an end-of-file token has been emitted.
 *
 * States that are not implemented yet fall through to `TODO(...)`.
 */
export class Tokenizer {
    /** State the machine is currently in; tokenization starts in the data state. */
    private state: State = State.Data;

    /** State to return to once a character reference has been processed. */
    private returnState!: State;

    /** Scratch buffer holding the code points consumed as part of a character reference. */
    private temporaryBuffer!: string;

    /** Token currently under construction (set by `create`). */
    private currentToken!: Token;

    /**
     * Character most recently returned by `consumeNext`.
     * At runtime this is `undefined` once the end of the input has been reached.
     */
    private currentInputCharacter!: string;

    /** Tokens emitted so far, in emission order. */
    public tokens: Array<Token> = new Array<Token>();

    /** Index into `input` of the next character to consume. */
    private pointer: number = 0;

    /**
     * @param input The complete text to tokenize.
     */
    public constructor(private input: string) {
    }

    /**
     * Runs one step of the state machine for the current state.
     *
     * A step consumes input, may emit tokens into `tokens`, and may change
     * `state`. "Reconsume" transitions are executed immediately through a
     * nested call, so a single `spin()` can process more than one state.
     */
    public spin(): void {
        switch (this.state) {
            case State.Data: {
                switch (this.consumeNext()) {
                    case '\u0026':
                        this.returnState = State.Data;
                        this.state = State.CharacterReference;
                        break;
                    case '\u003C': this.state = State.TagOpen; break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.emit({ type: Type.Character, data: this.currentInputCharacter });
                        break;
                    case undefined: this.emit({ type: Type.EndOfFile }); break;
                    default: this.emit({ type: Type.Character, data: this.currentInputCharacter });
                }

                break;
            }
            case State.RCDATA: {
                switch (this.consumeNext()) {
                    // Character references are recognised inside RCDATA content.
                    case '\u0026':
                        this.returnState = State.RCDATA;
                        this.state = State.CharacterReference;
                        break;
                    // NOTE(review): the specification switches to the *RCDATA* less-than sign state here;
                    // RAWTEXTLessThan looks like a copy/paste slip — confirm against the State enum.
                    case '\u003C': this.state = State.RAWTEXTLessThan; break;
                    case '\u0000': this.parseError('unexpected-null-character'); this.emit({ type: Type.Character, data: '\uFFFD' }); break;
                    case undefined: this.emit({ type: Type.EndOfFile }); break;
                    default: this.emit({ type: Type.Character, data: this.currentInputCharacter });
                }

                break;
            }
            case State.TagOpen: {
                switch (this.consumeNext()) {
                    case '\u0021': this.state = State.MarkupDeclarationOpen; break;
                    case '\u002F': this.state = State.EndTagOpen; break;
                    case '\u003F':
                        this.parseError('unexpected-question-mark-instead-of-tag-name');
                        this.create({ type: Type.Comment, data: '' });
                        this.reconsumeIn(State.BogusComment);
                        break;
                    case undefined:
                        this.parseError('eof-before-tag-name');
                        this.emit({ type: Type.Character, data: '\u003C' });
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiAlpha(this.currentInputCharacter)) {
                            this.create({ type: Type.StartTag, name: '', attributes: new AttributeList() });
                            this.reconsumeIn(State.TagName);
                            break;
                        }

                        // Not a tag after all: the '<' is plain text.
                        this.parseError('invalid-first-character-of-tag-name');
                        this.emit({ type: Type.Character, data: '\u003C' });
                        this.reconsumeIn(State.Data);
                    }
                }

                break;
            }
            case State.EndTagOpen: {
                switch (this.consumeNext()) {
                    case '\u003E': this.parseError('missing-end-tag-name'); this.state = State.Data; break;
                    case undefined:
                        this.parseError('eof-before-tag-name');
                        this.emit({ type: Type.Character, data: '\u003C' });
                        this.emit({ type: Type.Character, data: '\u002F' });
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiAlpha(this.currentInputCharacter)) {
                            this.create({ type: Type.EndTag, name: '', attributes: new AttributeList() });
                            this.reconsumeIn(State.TagName);
                            break;
                        }

                        this.parseError('invalid-first-character-of-tag-name');
                        this.create({ type: Type.Comment, data: '' });
                        this.reconsumeIn(State.BogusComment);
                    }
                }

                break;
            }
            case State.MarkupDeclarationOpen: {
                if (this.matchNextFew('--')) {
                    this.consumeNextFew('--');
                    this.create({ type: Type.Comment, data: '' });
                    this.state = State.CommentStart;
                } else if (this.matchNextFewCaseInsensitive('DOCTYPE')) {
                    this.consumeNextFewCaseInsensitive('DOCTYPE');
                    this.state = State.DOCTYPE;
                } else if (this.matchNextFew('[CDATA[')) {
                    this.consumeNextFew('[CDATA[');
                    // NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the
                    // CDATA section state does not exist and will not be started here.
                    this.parseError('cdata-in-html-content');
                    this.create({ type: Type.Comment, data: '[CDATA[' });
                    this.state = State.BogusComment;
                } else {
                    this.parseError('incorrectly-opened-comment');
                    this.create({ type: Type.Comment, data: '' });
                    this.state = State.BogusComment;
                }

                break;
            }
            case State.DOCTYPE: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': this.state = State.BeforeDOCTYPEName; break;
                    case '\u003E': this.reconsumeIn(State.BeforeDOCTYPEName); break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.emit({ type: Type.DOCTYPE, forceQuirks: true });
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default:
                        this.parseError('missing-whitespace-before-doctype-name');
                        this.reconsumeIn(State.BeforeDOCTYPEName);
                }

                break;
            }
            case State.BeforeDOCTYPEName: {
                // NOTE(review): the specification also handles '>' here ("missing-doctype-name", emit a
                // force-quirks DOCTYPE). That branch is not implemented; '>' currently becomes the name.
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.create({ type: Type.DOCTYPE, name: '\uFFFD' });
                        this.state = State.DOCTYPEName;
                        break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.emit({ type: Type.DOCTYPE, forceQuirks: true });
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                            this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter.toLowerCase() });
                            this.state = State.DOCTYPEName;
                            break;
                        }

                        this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter });
                        // The first name character has been stored; the rest is read by the DOCTYPE name state.
                        this.state = State.DOCTYPEName;
                    }
                }

                break;
            }
            case State.DOCTYPEName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': this.state = State.AfterDOCTYPEName; break;
                    case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.DOCTYPE); break;
                    case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.DOCTYPE).name += '\uFFFD'; break;
                    case undefined:
                        this.parseError('eof-in-doctype');
                        this.currentOfType(Type.DOCTYPE).forceQuirks = true;
                        this.emitCurrentOfType(Type.DOCTYPE);
                        this.emit({ type: Type.EndOfFile });
                        break;
                    default: {
                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                            this.currentOfType(Type.DOCTYPE).name += this.currentInputCharacter.toLowerCase();
                            break;
                        }

                        this.currentOfType(Type.DOCTYPE).name += this.currentInputCharacter;
                    }
                }

                break;
            }
            case State.TagName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': this.state = State.BeforeAttributeName; break;
                    case '\u002F': this.state = State.SelfClosingStartTag; break;
                    case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).name += '\uFFFD';
                        break;
                    case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
                    default: {
                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                            this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter.toLowerCase();
                            break;
                        }

                        this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter;
                    }
                }

                break;
            }
            case State.BeforeAttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': break;
                    case '\u002F':
                    case '\u003E':
                    case undefined: this.reconsumeIn(State.AfterAttributeName); break;
                    case '\u003D': {
                        // A leading '=' becomes the first character of the attribute name.
                        this.parseError('unexpected-equals-sign-before-attribute-name');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: this.currentInputCharacter, value: '' });
                        this.state = State.AttributeName;
                        break;
                    }
                    default: {
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
                        this.reconsumeIn(State.AttributeName);
                    }
                }

                break;
            }
            case State.AttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020':
                    case '\u002F':
                    case '\u003E':
                    case undefined: this.reconsumeIn(State.AfterAttributeName); break;
                    case '\u003D': this.state = State.BeforeAttributeValue; break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += '\uFFFD';
                        break;
                    case '\u0022':
                    case '\u0027':
                    case '\u003C':
                        this.parseError('unexpected-character-in-attribute-name');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
                        break;
                    default: {
                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
                            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter.toLowerCase();
                            break;
                        }

                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
                    }
                }

                break;
            }
            case State.AfterAttributeName: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': break;
                    case '\u002F': this.state = State.SelfClosingStartTag; break;
                    case '\u003D': this.state = State.BeforeAttributeValue; break;
                    case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
                    case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
                    default:
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
                        this.reconsumeIn(State.AttributeName);
                        break;
                }

                break;
            }
            case State.BeforeAttributeValue: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': break;
                    case '\u0022': this.state = State.AttributeValueDouble; break;
                    case '\u0027': this.state = State.AttributeValueSingle; break;
                    case '\u003E':
                        this.parseError('missing-attribute-value');
                        this.state = State.Data;
                        this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
                        break;
                    default:
                        this.reconsumeIn(State.AttributeValueUnquoted);
                }

                break;
            }
            case State.AttributeValueDouble: {
                switch (this.consumeNext()) {
                    case '\u0022': this.state = State.AfterAttributeValue; break;
                    case '\u0026': this.returnState = State.AttributeValueDouble; this.state = State.CharacterReference; break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
                        break;
                    case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
                    default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                }

                break;
            }
            case State.AttributeValueSingle: {
                switch (this.consumeNext()) {
                    case '\u0027': this.state = State.AfterAttributeValue; break;
                    case '\u0026': this.returnState = State.AttributeValueSingle; this.state = State.CharacterReference; break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
                        break;
                    case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
                    default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                }

                break;
            }
            case State.AttributeValueUnquoted: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': this.state = State.BeforeAttributeName; break;
                    case '\u0026': this.returnState = State.AttributeValueUnquoted; this.state = State.CharacterReference; break;
                    case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
                    case '\u0000':
                        this.parseError('unexpected-null-character');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
                        break;
                    case '\u0022':
                    case '\u0027':
                    case '\u003C':
                    case '\u003D':
                    case '\u0060':
                        this.parseError('unexpected-character-in-unquoted-attribute-value');
                        this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                        break;
                    case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
                    default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                }

                break;
            }
            case State.AfterAttributeValue: {
                switch (this.consumeNext()) {
                    case '\u0009':
                    case '\u000A':
                    case '\u000C':
                    case '\u0020': this.state = State.BeforeAttributeName; break;
                    case '\u002F': this.state = State.SelfClosingStartTag; break;
                    case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
                    case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
                    default: this.parseError('missing-whitespace-between-attributes'); this.reconsumeIn(State.BeforeAttributeName);
                }

                break;
            }
            case State.CommentStart: {
                switch (this.consumeNext()) {
                    case '\u002D': this.state = State.CommentStartDash; break;
                    case '\u003E': this.parseError('abrupt-closing-of-empty-comment'); this.state = State.Data; this.emitCurrentOfType(Type.Comment); break;
                    default: this.reconsumeIn(State.Comment);
                }

                break;
            }
            // FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state (adding **current** in some places)
            case State.Comment: {
                switch (this.consumeNext()) {
                    case '\u003C': this.currentOfType(Type.Comment).data += this.currentInputCharacter; this.state = State.CommentLessThanSign; break;
                    case '\u002D': this.state = State.CommentEndDash; break;
                    case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.Comment).data += '\uFFFD'; break;
                    case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
                    default: this.currentOfType(Type.Comment).data += this.currentInputCharacter;
                }

                break;
            }
            case State.CommentEndDash: {
                switch (this.consumeNext()) {
                    case '\u002D': this.state = State.CommentEnd; break;
                    case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
                    default: this.currentOfType(Type.Comment).data += '\u002D'; this.reconsumeIn(State.Comment);
                }

                break;
            }
            // Same FIXME as for the comment state: https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
            case State.CommentEnd: {
                switch (this.consumeNext()) {
                    case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.Comment); break;
                    case '\u0021': this.state = State.CommentEndBang; break;
                    case '\u002D': this.currentOfType(Type.Comment).data += '\u002D'; break;
                    case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
                    default: this.currentOfType(Type.Comment).data += '\u002D\u002D'; this.reconsumeIn(State.Comment);
                }

                break;
            }
            // Same FIXME as for the comment state: https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
            case State.BogusComment: {
                switch (this.consumeNext()) {
                    case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.Comment); break;
                    case undefined: this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
                    case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.Comment).data += '\uFFFD'; break;
                    default: this.currentOfType(Type.Comment).data += this.currentInputCharacter;
                }

                break;
            }
            case State.CharacterReference: {
                // The '&' that led here has already been consumed by the previous state.
                this.temporaryBuffer = '';
                this.temporaryBuffer += '\u0026';

                switch (this.consumeNext()) {
                    case '\u0023': this.temporaryBuffer += this.currentInputCharacter; this.state = State.NumericCharacterReference; break;
                    default: {
                        if (this.asciiAlphanumeric(this.currentInputCharacter)) {
                            this.reconsumeIn(State.NamedCharacterReference);
                            break;
                        }

                        this.flushCodePointsConsumedAsCharacterReference();
                        this.reconsumeIn(this.returnState);
                    }
                }

                break;
            }
            case State.NamedCharacterReference: {
                // The specification asks for the *longest* entity name matching the upcoming input,
                // so the result must not depend on the iteration order of the entity table.
                let entry: string | undefined;

                for (const candidate in entities) {
                    if (this.matchNextFew(candidate) && (entry === undefined || candidate.length > entry.length))
                        entry = candidate;
                }

                if (entry === undefined) {
                    this.flushCodePointsConsumedAsCharacterReference();
                    this.state = State.AmbiguousAmpersand;
                    break;
                }

                this.consumeNextFew(entry);
                this.temporaryBuffer += entry;

                const endsWithSemicolon = entry[entry.length - 1] === '\u003B';

                // Inside an attribute value, a reference without ';' that is followed by '=' or an
                // alphanumeric is kept as literal text.
                if (this.consumedAsPartOfAnAttribute() && !endsWithSemicolon && (this.next() === '\u003D' || this.asciiAlphanumeric(this.next() ?? ''))) {
                    this.flushCodePointsConsumedAsCharacterReference();
                    this.state = this.returnState;
                    break;
                }

                if (!endsWithSemicolon)
                    this.parseError('missing-semicolon-after-character-reference');

                // Replace the raw reference text with the characters it stands for.
                this.temporaryBuffer = '';
                this.temporaryBuffer += entities[entry].characters;
                this.flushCodePointsConsumedAsCharacterReference();
                this.state = this.returnState;

                break;
            }
            case State.AmbiguousAmpersand: {
                switch (this.consumeNext()) {
                    case '\u003B': this.parseError('unknown-named-character-reference'); this.reconsumeIn(this.returnState); break;
                    default: {
                        if (this.asciiAlphanumeric(this.currentInputCharacter)) {
                            if (this.consumedAsPartOfAnAttribute()) {
                                this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
                            } else {
                                this.emit({ type: Type.Character, data: this.currentInputCharacter });
                            }

                            break;
                        }

                        this.reconsumeIn(this.returnState);
                    }
                }

                break;
            }
            default: TODO(`Unimplemented state '${this.state}'`);
        }
    }

    /**
     * Flushes `temporaryBuffer`: appends it to the current attribute's value when
     * the character reference was found inside an attribute value, otherwise
     * emits one character token per code point in the buffer.
     */
    private flushCodePointsConsumedAsCharacterReference(): void {
        if (this.consumedAsPartOfAnAttribute()) {
            this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.temporaryBuffer;
            return;
        }

        for (const codePoint of this.temporaryBuffer)
            this.emit({ type: Type.Character, data: codePoint });
    }

    /**
     * @returns `true` when the return state is one of the three attribute value states.
     */
    private consumedAsPartOfAnAttribute(): boolean {
        return this.returnState === State.AttributeValueDouble || this.returnState === State.AttributeValueSingle || this.returnState === State.AttributeValueUnquoted;
    }

    /**
     * @returns `true` when `input` is a single ASCII letter or digit.
     */
    private asciiAlphanumeric(input: string): boolean {
        return this.asciiAlpha(input) || this.asciiDigit(input);
    }

    /**
     * @returns `true` when `input` is a single ASCII letter (A-Z or a-z).
     */
    private asciiAlpha(input: string): boolean {
        return this.asciiUpperAlpha(input) || this.asciiLowerAlpha(input);
    }

    /**
     * @returns `true` when `input` is exactly one character in U+0041 (A) to U+005A (Z).
     */
    private asciiUpperAlpha(input: string): boolean {
        // Anchored so anything that is not exactly one matching character is rejected
        // (this includes the end-of-input `undefined`, which `test` stringifies).
        return /^[\u0041-\u005A]$/.test(input);
    }

    /**
     * @returns `true` when `input` is exactly one character in U+0061 (a) to U+007A (z).
     */
    private asciiLowerAlpha(input: string): boolean {
        return /^[\u0061-\u007A]$/.test(input);
    }

    /**
     * @returns `true` when `input` is exactly one character in U+0030 (0) to U+0039 (9).
     */
    private asciiDigit(input: string): boolean {
        return /^[\u0030-\u0039]$/.test(input);
    }

    /**
     * Steps back one character, switches to `state` and immediately runs it, so
     * the character just consumed is processed again by the new state.
     */
    private reconsumeIn(state: State): void {
        this.pointer--;
        this.state = state;
        this.spin();
    }

    /**
     * Reports a parse error. Errors are only logged; tokenization continues.
     */
    private parseError(error: ParseError): void {
        console.error('Parse error: ' + error);
    }

    /**
     * Consumes the next input character and stores it in `currentInputCharacter`.
     *
     * The pointer is advanced even at the end of the input so that a subsequent
     * `reconsumeIn` lands on the end-of-input position again.
     *
     * @returns The consumed character, or `undefined` at the end of the input.
     */
    private consumeNext(): string | undefined {
        this.currentInputCharacter = this.input[this.pointer];
        this.pointer++;

        return this.currentInputCharacter;
    }

    /**
     * @returns The next input character without consuming it, or `undefined` at the end of the input.
     */
    private next(): string | undefined {
        return this.input[this.pointer];
    }

    /**
     * @returns `true` when the unconsumed input starts with `input` (case-sensitive).
     */
    private matchNextFew(input: string): boolean {
        return this.input.startsWith(input, this.pointer);
    }

    /**
     * @returns `true` when the unconsumed input starts with `input`, ignoring case.
     */
    private matchNextFewCaseInsensitive(input: string): boolean {
        return this.input.slice(this.pointer, this.pointer + input.length).toLowerCase() === input.toLowerCase();
    }

    /**
     * Consumes `input.length` characters and verifies that each one equals the
     * corresponding character of `input`.
     */
    private consumeNextFew(input: string): void {
        for (let i = 0; i < input.length; i++) {
            const consumed = this.consumeNext();

            VERIFY(consumed === input[i], `Expected '${input[i]}' (${input} at ${i}), got ${consumed} instead`);
        }
    }

    /**
     * Case-insensitive variant of `consumeNextFew`.
     */
    private consumeNextFewCaseInsensitive(input: string): void {
        for (let i = 0; i < input.length; i++) {
            const consumed = this.consumeNext()?.toLowerCase();

            VERIFY(consumed === input[i].toLowerCase(), `Expected '${input[i].toLowerCase()}' (${input.toLowerCase()} at ${i}), got ${consumed} instead`);
        }
    }

    /**
     * Appends `token` to the output list.
     */
    private emit(token: Token): void {
        this.tokens.push(token);
    }

    /**
     * Emits the current token after verifying that it has the given type.
     */
    private emitCurrentOfType(type: Type): void {
        this.tokens.push(this.currentOfType(type));
    }

    /**
     * Emits the current token after verifying that it has one of the two given types.
     */
    private emitCurrentOfEitherType(a: Type, b: Type): void {
        this.tokens.push(this.currentOfEitherType(a, b));
    }

    /**
     * @returns The current token, narrowed to `type` after a runtime `VERIFY` of its type.
     */
    private currentOfType<T extends Type>(type: T): Token & { type: T } {
        VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`);

        return this.currentToken as Token & { type: T };
    }

    /**
     * @returns The current token, narrowed to `a | b` after a runtime `VERIFY` of its type.
     */
    private currentOfEitherType<T extends Type, U extends Type>(a: T, b: U): Token & { type: T | U } {
        VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);

        return this.currentToken as Token & { type: T | U };
    }

    /**
     * Makes `token` the current token (the one under construction).
     *
     * @returns The same token.
     */
    private create(token: Token): Token {
        return this.currentToken = token;
    }
}
|