Token: Rewrite using classes, initial range implementation

This commit is contained in:
networkException 2022-01-04 00:40:12 +01:00
parent fea3ba16a9
commit cba7f2b58f
3 changed files with 383 additions and 134 deletions

View file

@ -47,7 +47,7 @@ export class Highlighter {
}
case State.Plain: {
switch (this.consumeNextTokenType()) {
case Type.Character: this.currentNode.content += this.currentTokenOfType(Type.Character).data; break;
case Type.Character: this.currentNode.content += this.currentTokenOfType(CharacterToken).data; break;
default:
this.emitNode(this.currentNode);
this.reconsumeIn(State.Undefined);
@ -56,15 +56,15 @@ export class Highlighter {
break;
}
case State.StartTag: {
switch (this.consumeNextTokenOfType(Type.StartTag).name) {
switch (this.consumeNextTokenOfType(StartTagToken).name) {
case 'script': this.returnState = State.BeforeScript; break;
default: this.returnState = State.Undefined; break;
}
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: `<` });
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Tag, content: this.currentTokenOfType(Type.StartTag).name });
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Tag, content: this.currentTokenOfType(StartTagToken).name });
if (this.currentTokenOfType(Type.StartTag).attributes.nonEmpty()) {
if (this.currentTokenOfType(StartTagToken).attributes.nonEmpty()) {
this.emitSpace({ line: 0, character: 0 });
this.reconsumeIn(State.Attributes);
}
@ -77,7 +77,7 @@ export class Highlighter {
}
case State.EndTag: {
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '</' });
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Tag, content: this.consumeNextTokenOfType(Type.EndTag).name });
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Tag, content: this.consumeNextTokenOfType(EndTagToken).name });
this.reconsumeIn(State.AfterAttributes);
@ -86,7 +86,7 @@ export class Highlighter {
break;
}
case State.Attributes: {
const attributes = this.consumeNextTokenOfEitherType(Type.StartTag, Type.EndTag).attributes.list;
const attributes = this.consumeNextTokenOfEitherType(StartTagToken, EndTagToken).attributes.list;
for (let i = 0; i < attributes.length; i++) {
const attribute = attributes[i];
@ -103,12 +103,13 @@ export class Highlighter {
case State.AfterAttributes: {
switch (this.consumeNextTokenType()) {
case Type.StartTag:
if (this.currentTokenOfType(Type.StartTag).selfClosing === undefined) {
// FIXME: StartTagToken does not support selfClosing as of now
// if (this.currentTokenOfType(StartTagToken).selfClosing === undefined) {
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '>' });
} else {
this.emitSpace({ line: 0, character: 0 });
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '/>' });
}
// } else {
// this.emitSpace({ line: 0, character: 0 });
// this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '/>' });
// }
break;
case Type.EndTag:
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '>' });
@ -132,7 +133,7 @@ export class Highlighter {
}
case State.Script: {
switch (this.consumeNextTokenType()) {
case Type.Character: this.currentNode.content += this.currentTokenOfType(Type.Character).data; break;
case Type.Character: this.currentNode.content += this.currentTokenOfType(CharacterToken).data; break;
default:
this.emitNode(this.currentNode);
this.reconsumeIn(State.Undefined);
@ -141,7 +142,7 @@ export class Highlighter {
break;
}
case State.DOCTYPE: {
const doctype = this.consumeNextTokenOfType(Type.DOCTYPE);
const doctype = this.consumeNextTokenOfType(DOCTYPEToken);
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '<!' });
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Tag, content: 'DOCTYPE' });
@ -157,7 +158,7 @@ export class Highlighter {
break;
}
case State.Comment:
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Comment, content: `<!--${this.consumeNextTokenOfType(Type.Comment).data}-->` });
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Comment, content: `<!--${this.consumeNextTokenOfType(CommentToken).data}-->` });
this.state = State.Undefined;
break;
@ -177,24 +178,24 @@ export class Highlighter {
return this.currentNode = node;
}
// Diff view: the removed enum-based lines and the added constructor-based lines of this helper are shown together below.
// Makes tokens[pointer] the current token, VERIFYs that it is of the requested kind, advances the pointer by one and returns the token.
// NOTE(review): no bounds check on `pointer` is visible here; confirm callers never consume past the end of `tokens`.
private consumeNextTokenOfType<T extends Type>(type: T): Token & { type: T } {
private consumeNextTokenOfType<T extends Token>(type: Constructor<T>): T {
this.currentToken = this.tokens[this.pointer];
VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`);
VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`);
this.pointer++;
return this.currentToken as Token & { type: T };
return this.currentToken;
}
// Diff view: the removed enum-based lines and the added constructor-based lines of this helper are shown together below.
// Same as consuming a token of one kind, but accepts either of two kinds: tokens[pointer] becomes the current token,
// it is VERIFYed against `a` or `b`, the pointer advances by one and the token is returned.
private consumeNextTokenOfEitherType<T extends Type, U extends Type>(a: T, b: U): Token & { type: T | U } {
private consumeNextTokenOfEitherType<T extends Token, U extends Token>(a: Constructor<T>, b: Constructor<U>): T | U {
this.currentToken = this.tokens[this.pointer];
VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);
VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`);
this.pointer++;
return this.currentToken as Token & { type: T };
return this.currentToken;
}
private consumeNextTokenType(): Type {
@ -211,16 +212,16 @@ export class Highlighter {
return this.currentToken;
}
// Diff view: the removed enum-based lines and the added constructor-based lines of this helper are shown together below.
// Returns the current token after VERIFYing that it is of the requested kind; the pointer is not moved.
private currentTokenOfType<T extends Type>(type: T): Token & { type: T } {
VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`);
private currentTokenOfType<T extends Token>(type: Constructor<T>): T {
VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`);
return this.currentToken as Token & { type: T };
return this.currentToken;
}
// Diff view: the removed enum-based lines and the added constructor-based lines of this helper are shown together below.
// Returns the current token after VERIFYing that it is of kind `a` or kind `b`; the pointer is not moved.
private currentTokenOfEitherType<T extends Type, U extends Type>(a: T, b: U): Token & { type: T | U } {
VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);
private currentTokenOfEitherType<T extends Token, U extends Token>(a: Constructor<T>, b: Constructor<U>): T | U {
VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`);
return this.currentToken as Token & { type: T };
return this.currentToken;
}
private reconsumeIn(state: State): void {

View file

@ -1,8 +1,9 @@
import { TODO, VERIFY } from "../util/assertions.js";
import { TODO, VERIFY, VERIFY_NOT_REACHED } from "../util/assertions.js";
import { Constructor } from "../util/guards.js";
import { ParseError } from "./errors.js";
import { entities } from "./tokenizer/entities.js";
import { State } from "./tokenizer/state.js";
import { AttributeList, Token, Type } from "./tokenizer/token.js";
import { Attribute, CharacterToken, CommentToken, DOCTYPEToken, EndOfFileToken, EndTagToken, Position, StartTagToken, Token } from "./tokenizer/token.js";
export class Tokenizer {
private state: State = State.Data;
@ -13,6 +14,8 @@ export class Tokenizer {
private currentToken!: Token;
private currentInputCharacter!: string;
private currentPosition: Position = { line: 0, column: 0, index: 0 };
public tokens: Array<Token> = new Array<Token>();
private pointer: number = 0;
@ -30,10 +33,10 @@ export class Tokenizer {
case '\u003C': this.state = State.TagOpen; break;
case '\u0000':
this.parseError('unexpected-null-character');
this.emit({ type: Type.Character, data: this.currentInputCharacter });
this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
break;
case undefined: this.emit({ type: Type.EndOfFile }); break;
default: this.emit({ type: Type.Character, data: this.currentInputCharacter });
case undefined: this.emit(EndOfFileToken.create()); break;
default: this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
}
break;
@ -41,9 +44,9 @@ export class Tokenizer {
case State.RCDATA: {
switch (this.consumeNext()) {
case '\u003C': this.state = State.RAWTEXTLessThan; break;
case '\u0000': this.parseError('unexpected-null-character'); this.emit({ type: Type.Character, data: '\uFFFD' }); break;
case undefined: this.emit({ type: Type.EndOfFile }); break;
default: this.emit({ type: Type.Character, data: this.currentInputCharacter });
case '\u0000': this.parseError('unexpected-null-character'); this.emit(CharacterToken.createReplacementCharacter().at(this.currentPosition)); break;
case undefined: this.emit(EndOfFileToken.create()); break;
default: this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
}
break;
@ -54,23 +57,23 @@ export class Tokenizer {
case '\u002F': this.state = State.EndTagOpen; break;
case '\u003F':
this.parseError('unexpected-question-mark-instead-of-tag-name');
this.create({ type: Type.Comment, data: '' });
this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
this.reconsumeIn(State.BogusComment);
break;
case undefined:
this.parseError('eof-before-tag-name');
this.emit({ type: Type.Character, data: '\u003C' });
this.emit({ type: Type.EndOfFile });
this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
this.emit(EndOfFileToken.create());
break;
default: {
if (this.asciiAlpha(this.currentInputCharacter)) {
this.create({ type: Type.StartTag, name: '', attributes: new AttributeList() });
this.create(StartTagToken.createEmpty().startingAt(this.currentPosition));
this.reconsumeIn(State.TagName);
break;
}
this.parseError('invalid-first-character-of-tag-name');
this.emit({ type: Type.Character, data: '\u003C' });
this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
this.reconsumeIn(State.Data);
}
}
@ -82,19 +85,19 @@ export class Tokenizer {
case '\u003E': this.parseError('missing-end-tag-name'); this.state = State.Data; break;
case undefined:
this.parseError('eof-before-tag-name');
this.emit({ type: Type.Character, data: '\u003C' });
this.emit({ type: Type.Character, data: '\u002F' });
this.emit({ type: Type.EndOfFile });
this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
this.emit(CharacterToken.createWith('\u002F').at(this.currentPosition));
this.emit(EndOfFileToken.create());
break;
default: {
if (this.asciiAlpha(this.currentInputCharacter)) {
this.create({ type: Type.EndTag, name: '', attributes: new AttributeList() });
this.create(EndTagToken.createEmpty().startingAt(this.currentPosition));
this.reconsumeIn(State.TagName);
break;
}
this.parseError('invalid-first-character-of-tag-name');
this.create({ type: Type.Comment, data: '' });
this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
this.reconsumeIn(State.BogusComment);
}
}
@ -104,7 +107,7 @@ export class Tokenizer {
case State.MarkupDeclarationOpen: {
if (this.matchNextFew('--')) {
this.consumeNextFew('--');
this.create({ type: Type.Comment, data: '' });
this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
this.state = State.CommentStart;
} else if (this.matchNextFewCaseInsensitive('DOCTYPE')) {
this.consumeNextFewCaseInsensitive('DOCTYPE');
@ -114,11 +117,11 @@ export class Tokenizer {
// NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the CDATA section state does not
// exist and will not be started here.
this.parseError('cdata-in-html-content');
this.create({ type: Type.Comment, data: '[CDATA[' });
this.create(CommentToken.createWith('[CDATA[').startingAt(this.currentPosition));
this.state = State.BogusComment;
} else {
this.parseError('incorrectly-opened-comment');
this.create({ type: Type.Comment, data: '' });
this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
this.state = State.BogusComment;
}
@ -133,8 +136,8 @@ export class Tokenizer {
case '\u003E': this.reconsumeIn(State.BeforeDOCTYPEName); break;
case undefined:
this.parseError('eof-in-doctype');
this.emit({ type: Type.DOCTYPE, forceQuirks: true });
this.emit({ type: Type.EndOfFile });
this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition));
this.emit(EndOfFileToken.create());
break;
default:
this.parseError('missing-whitespace-before-doctype-name');
@ -151,22 +154,22 @@ export class Tokenizer {
case '\u0020': break;
case '\u0000':
this.parseError('unexpected-null-character');
this.create({ type: Type.DOCTYPE, name: '\uFFFD' });
this.create(DOCTYPEToken.createWithName('\uFFFD').startingAt(this.currentPosition));
this.state = State.DOCTYPEName;
break;
case undefined:
this.parseError('eof-in-doctype');
this.emit({ type: Type.DOCTYPE, forceQuirks: true });
this.emit({ type: Type.EndOfFile });
this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition));
this.emit(EndOfFileToken.create());
break;
default: {
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter.toLowerCase()});
this.create(DOCTYPEToken.createWithName(this.currentInputCharacter.toLowerCase()).startingAt(this.currentPosition));
this.state = State.DOCTYPEName;
break;
}
this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter });
this.create(DOCTYPEToken.createWithName(this.currentInputCharacter).startingAt(this.currentPosition));
this.state = State.DOCTYPE;
}
}
@ -179,21 +182,21 @@ export class Tokenizer {
case '\u000A':
case '\u000C':
case '\u0020': this.state = State.AfterDOCTYPEName; break;
case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.DOCTYPE); break;
case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.DOCTYPE)!.name += '\uFFFD'; break;
case '\u003E': this.state = State.Data; this.emitCurrentOfType(DOCTYPEToken); break;
case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(DOCTYPEToken).appendReplacementCharacterToName(); break;
case undefined:
this.parseError('eof-in-doctype');
this.currentOfType(Type.DOCTYPE).forceQuirks = true;
this.emitCurrentOfType(Type.DOCTYPE);
this.emit({ type: Type.EndOfFile });
this.currentOfType(DOCTYPEToken).forceQuirks = true;
this.emitCurrentOfType(DOCTYPEToken);
this.emit(EndOfFileToken.create());
break;
default: {
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
this.currentOfType(Type.DOCTYPE)!.name += this.currentInputCharacter.toLowerCase();
this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter.toLowerCase());
break;
}
this.currentOfType(Type.DOCTYPE)!.name += this.currentInputCharacter;
this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter);
}
}
@ -206,19 +209,19 @@ export class Tokenizer {
case '\u000C':
case '\u0020': this.state = State.BeforeAttributeName; break;
case '\u002F': this.state = State.SelfClosingStartTag; break;
case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
case '\u0000':
this.parseError('unexpected-null-character');
this.currentOfEitherType(Type.StartTag, Type.EndTag).name += '\uFFFD';
this.currentOfEitherType(StartTagToken, EndTagToken).appendReplacementCharacterToName();
break;
case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
default: {
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter.toLowerCase();
this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter.toLowerCase());
break;
}
this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter;
this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter);
}
}
@ -235,12 +238,12 @@ export class Tokenizer {
case undefined: this.reconsumeIn(State.AfterAttributeName); break;
case '\u003D': {
this.parseError('unexpected-equals-sign-before-attribute-name');
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: this.currentInputCharacter, value: '' });
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyValue(this.currentInputCharacter));
this.state = State.AttributeName;
break;
}
default: {
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue());
this.reconsumeIn(State.AttributeName);
}
}
@ -258,21 +261,21 @@ export class Tokenizer {
case undefined: this.reconsumeIn(State.AfterAttributeName); break;
case '\u003D': this.state = State.BeforeAttributeValue; break;
case '\u0000': this.parseError('unexpected-null-character');
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += '\uFFFD';
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToName();
break;
case '\u0022':
case '\u0027':
case '\u003C':
this.parseError('unexpected-character-in-attribute-name');
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter);
break;
default: {
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter.toLowerCase();
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter.toLowerCase());
break;
}
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter);
}
}
@ -286,10 +289,10 @@ export class Tokenizer {
case '\u0020': break;
case '\u002F': this.state = State.SelfClosingStartTag; break;
case '\u003D': this.state = State.BeforeAttributeValue; break;
case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
default:
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue());
this.reconsumeIn(State.AttributeName);
break;
}
@ -307,7 +310,7 @@ export class Tokenizer {
case '\u003E':
this.parseError('missing-attribute-value');
this.state = State.Data;
this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
break;
default:
this.reconsumeIn(State.AttributeValueUnquoted);
@ -321,10 +324,10 @@ export class Tokenizer {
case '\u0026': this.returnState = State.AttributeValueDouble; this.state = State.CharacterReference; break;
case '\u0000':
this.parseError('unexpected-null-character');
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
break;
case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
}
break;
@ -335,10 +338,10 @@ export class Tokenizer {
case '\u0026': this.returnState = State.AttributeValueSingle; this.state = State.CharacterReference; break;
case '\u0000':
this.parseError('unexpected-null-character');
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
break;
case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
}
break;
@ -350,10 +353,10 @@ export class Tokenizer {
case '\u000C':
case '\u0020': this.state = State.BeforeAttributeName; break;
case '\u0026': this.returnState = State.AttributeValueUnquoted; this.state = State.CharacterReference; break;
case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
case '\u0000':
this.parseError('unexpected-null-character');
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
break;
case '\u0022':
case '\u0027':
@ -361,10 +364,10 @@ export class Tokenizer {
case '\u003D':
case '\u0060':
this.parseError('unexpected-character-in-unquoted-attribute-value');
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
break;
case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
}
break;
@ -376,8 +379,8 @@ export class Tokenizer {
case '\u000C':
case '\u0020': this.state = State.BeforeAttributeName; break;
case '\u002F': this.state = State.SelfClosingStartTag; break;
case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
default: this.parseError('missing-whitespace-between-attributes'); this.reconsumeIn(State.BeforeAttributeName);
}
@ -386,7 +389,7 @@ export class Tokenizer {
case State.CommentStart: {
switch (this.consumeNext()) {
case '\u002D': this.state = State.CommentStartDash; break;
case '\u003E': this.parseError('abrupt-closing-of-empty-comment'); this.state = State.Data; this.emitCurrentOfType(Type.Comment); break;
case '\u003E': this.parseError('abrupt-closing-of-empty-comment'); this.state = State.Data; this.emitCurrentOfType(CommentToken); break;
default: this.reconsumeIn(State.Comment);
}
@ -395,11 +398,11 @@ export class Tokenizer {
// FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state (adding **current** in some places)
case State.Comment: {
switch (this.consumeNext()) {
case '\u003C': this.currentOfType(Type.Comment).data += this.currentInputCharacter; this.state = State.CommentLessThanSign; break;
case '\u003C': this.currentOfType(CommentToken).append(this.currentInputCharacter); this.state = State.CommentLessThanSign; break;
case '\u002D': this.state = State.CommentEndDash; break;
case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.Comment).data += '\uFFFD'; break;
case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
default: this.currentOfType(Type.Comment).data += this.currentInputCharacter;
case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(CommentToken).appendReplacementCharacter(); break;
case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
default: this.currentOfType(CommentToken).append(this.currentInputCharacter);
}
break;
@ -407,8 +410,8 @@ export class Tokenizer {
case State.CommentEndDash: {
switch (this.consumeNext()) {
case '\u002D': this.state = State.CommentEnd; break;
case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
default: this.currentOfType(Type.Comment).data += '\u002D'; this.reconsumeIn(State.Comment);
case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
default: this.currentOfType(CommentToken).append('\u002D'); this.reconsumeIn(State.Comment);
}
break;
@ -416,11 +419,11 @@ export class Tokenizer {
// FIXME: Same possible improvement as for the comment state (adding **current** in some places): https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
case State.CommentEnd: {
switch (this.consumeNext()) {
case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.Comment); break;
case '\u003E': this.state = State.Data; this.emitCurrentOfType(CommentToken); break;
case '\u0021': this.state = State.CommentEndBang; break;
case '\u002D': this.currentOfType(Type.Comment).data += '\u002D'; break;
case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
default: this.currentOfType(Type.Comment).data += '\u002D\u002D'; this.reconsumeIn(State.Comment);
case '\u002D': this.currentOfType(CommentToken).append('\u002D'); break;
case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
default: this.currentOfType(CommentToken).append('\u002D\u002D'); this.reconsumeIn(State.Comment);
}
break;
@ -428,10 +431,10 @@ export class Tokenizer {
// FIXME: Same possible improvement as for the comment state (adding **current** in some places): https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
case State.BogusComment: {
switch (this.consumeNext()) {
case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.Comment); break;
case undefined: this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.Comment).data += '\uFFFD'; break;
default: this.currentOfType(Type.Comment).data += this.currentInputCharacter;
case '\u003E': this.state = State.Data; this.emitCurrentOfType(CommentToken); break;
case undefined: this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(CommentToken).appendReplacementCharacter(); break;
default: this.currentOfType(CommentToken).append(this.currentInputCharacter);
}
break;
@ -496,9 +499,9 @@ export class Tokenizer {
default: {
if (this.asciiAlphanumeric(this.currentInputCharacter)) {
if (this.consumedAsPartOfAnAttribute()) {
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
} else {
this.emit({ type: Type.Character, data: this.currentInputCharacter });
this.emit(CharacterToken.createWith(this.currentInputCharacter));
}
break;
@ -516,12 +519,12 @@ export class Tokenizer {
// Diff view: removed and added lines of this method are shown together below.
// Flushes the temporary buffer. When the character reference was consumed as part of an attribute, the whole buffer is
// appended to the current attribute's value; otherwise every code point of the buffer is emitted as its own character token.
private flushCodePointsConsumedAsCharacterReference(): void {
if (this.consumedAsPartOfAnAttribute()) {
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.temporaryBuffer;
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.temporaryBuffer);
return;
}
// for...of walks the string by code point, so one token is emitted per code point.
for (const codePoint of this.temporaryBuffer)
this.emit({ type: Type.Character, data: codePoint });
this.emit(CharacterToken.createWith(codePoint));
}
private consumedAsPartOfAnAttribute(): boolean {
@ -562,6 +565,14 @@ export class Tokenizer {
this.currentInputCharacter = this.input[this.pointer];
this.pointer++;
this.currentPosition.column++;
this.currentPosition.index++;
if (this.currentInputCharacter === '\n') {
this.currentPosition.column = 0;
this.currentPosition.line++;
}
return this.currentInputCharacter;
}
@ -594,34 +605,53 @@ export class Tokenizer {
}
// Completes the token's range via populateRangeOnEmit and then appends the token to the output list.
private emit(token: Token): void {
    this.populateRangeOnEmit(token);

    const output = this.tokens;
    output.push(token);
}
// Diff view: the removed enum-based lines and the added constructor-based lines of this helper are shown together below.
// VERIFYs that the current token is of the given kind, completes its range via populateRangeOnEmit (added lines only)
// and appends it to `tokens`.
private emitCurrentOfType(type: Type): void {
VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`);
private emitCurrentOfType(type: Constructor<Token>): void {
VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`);
this.populateRangeOnEmit(this.currentToken);
this.tokens.push(this.currentToken);
}
// Diff view: the removed enum-based lines and the added constructor-based lines of this helper are shown together below.
// VERIFYs that the current token is of kind `a` or kind `b`, completes its range via populateRangeOnEmit (added lines only)
// and appends it to `tokens`.
private emitCurrentOfEitherType(a: Type, b: Type): void {
VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);
private emitCurrentOfEitherType<T extends Token, U extends Token>(a: Constructor<T>, b: Constructor<U>): void {
VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`);
this.populateRangeOnEmit(this.currentToken);
this.tokens.push(this.currentToken);
}
// Diff view: the removed enum-based lines and the added constructor-based lines of this helper are shown together below.
// VERIFYs that the current token is of the requested kind and returns it.
// NOTE(review): the added version also calls populateRangeOnEmit on every access, not only when emitting. With the checks in
// populateRangeOnEmit, a token that already has a start would get its end set (presumably by endingAt) on the first
// access and never updated afterwards — confirm this is intended.
private currentOfType<T extends Type>(type: T): Token & { type: T } {
VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`);
private currentOfType<T extends Token>(type: Constructor<T>): T {
VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`);
return this.currentToken as Token & { type: T };
this.populateRangeOnEmit(this.currentToken);
return this.currentToken;
}
// Diff view: the removed enum-based lines and the added constructor-based lines of this helper are shown together below.
// VERIFYs that the current token is of kind `a` or kind `b` and returns it.
// NOTE(review): the added version also calls populateRangeOnEmit on every access, not only when emitting. With the checks in
// populateRangeOnEmit, a token that already has a start would get its end set (presumably by endingAt) on the first
// access and never updated afterwards — confirm this is intended.
private currentOfEitherType<T extends Type, U extends Type>(a: T, b: U): Token & { type: T | U } {
VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);
private currentOfEitherType<T extends Token, U extends Token>(a: Constructor<T>, b: Constructor<U>): T | U {
VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`);
return this.currentToken as Token & { type: T };
this.populateRangeOnEmit(this.currentToken);
return this.currentToken;
}
// Fills in whatever part of `token.range` is still missing, using the tokenizer's current position:
//  - neither start nor end set: the token is placed with `at(...)`
//  - start set, end not set:    the token is closed with `endingAt(...)`
//  - end set without a start:   treated as unreachable
// The three checks run one after another and re-read the range each time.
private populateRangeOnEmit(token: Token): void {
    // `currentPosition` is a single object whose fields are updated in place while input is consumed.
    // Token's implementation is not visible here and may keep the object it is handed, so pass a
    // copy instead of the live object; otherwise a stored position could change after the fact.
    const snapshot = (): Position => ({ ...this.currentPosition });

    if (token.range.start === undefined && token.range.end === undefined)
        token.at(snapshot());

    if (token.range.start !== undefined && token.range.end === undefined)
        token.endingAt(snapshot());

    if (token.range.start === undefined && token.range.end !== undefined)
        VERIFY_NOT_REACHED();
}
/**
 * Makes the given token the current token.
 *
 * If the token has no start position yet, its start is set to the current
 * position first.
 *
 * @param token The token to install as the current token.
 * @returns The same token that was passed in.
 */
private create(token: Token): Token {
    const needsStart = token.range.start === undefined;

    if (needsStart) {
        token.startingAt(this.currentPosition);
    }

    this.currentToken = token;
    return token;
}
}

View file

@ -1,3 +1,5 @@
import { VERIFY, VERIFY_NOT_REACHED } from "../../util/assertions.js";
export const enum Type {
DOCTYPE = 'DOCTYPE',
StartTag = 'start tag',
@ -7,7 +9,52 @@ export const enum Type {
EndOfFile = 'end-of-file'
}
export type Attribute = { name: NonNullable<string>, value: NonNullable<string> };
/** U+FFFD REPLACEMENT CHARACTER, appended by the `appendReplacementCharacter*` helpers in this file. */
export const REPLACEMENT_CHARACTER = '\uFFFD';
/** A location described by a line, a column and an index. */
export type Position = {
    line: number;
    column: number;
    index: number;
};

/** A span described by the position where it starts and the position where it ends. */
export type Range = {
    start: Position;
    end: Position;
};
/**
 * A mutable name/value pair whose name and value can be extended piece by piece.
 */
export class Attribute {
    /** Creates an attribute whose name and value are both empty strings. */
    public static createWithEmptyNameAndValue(): Attribute {
        return new Attribute('', '');
    }

    /**
     * Creates an attribute with the given name and an empty value.
     *
     * @param name The initial attribute name.
     */
    public static createWithEmptyValue(name: string): Attribute {
        return new Attribute(name, '');
    }

    /**
     * @param name The initial attribute name.
     * @param value The initial attribute value.
     */
    public constructor(public name: string, public value: string) {}

    /** Appends `characters` to the end of the name. */
    public appendToName(characters: string): void {
        this.name = this.name + characters;
    }

    /** Appends `REPLACEMENT_CHARACTER` to the end of the name. */
    public appendReplacementCharacterToName(): void {
        this.appendToName(REPLACEMENT_CHARACTER);
    }

    /** Appends `characters` to the end of the value. */
    public appendToValue(characters: string): void {
        this.value = this.value + characters;
    }

    /** Appends `REPLACEMENT_CHARACTER` to the end of the value. */
    public appendReplacementCharacterToValue(): void {
        this.appendToValue(REPLACEMENT_CHARACTER);
    }
}
export class AttributeList {
private attributes: Array<Attribute>;
@ -33,29 +80,200 @@ export class AttributeList {
}
}
export type Token = { type: Type.DOCTYPE, name?: string, publicIdentifier?: string, systemIdentifier?: string, forceQuirks?: true } |
{ type: Type.StartTag, name: NonNullable<string>, selfClosing?: true, attributes: AttributeList } |
{ type: Type.EndTag, name: NonNullable<string>, selfClosing?: true, attributes: AttributeList } |
{ type: Type.Comment, data: NonNullable<string> } |
{ type: Type.Character, data: NonNullable<string> } |
{ type: Type.EndOfFile };
/**
 * Base class of all tokens: stores the token's `Type` and a range that is
 * filled in after construction.
 *
 * The range object starts out empty, so `range.start` and `range.end` are
 * `undefined` at runtime until `startingAt`, `endingAt` or `at` has been called,
 * even though the declared type does not say so.
 */
export abstract class Token {
    readonly #type: Type;
    #range!: Range;

    /**
     * @param type The kind of token being constructed.
     */
    protected constructor(type: Type) {
        // @ts-expect-error The range is created empty and populated later.
        this.#range = {};
        this.#type = type;
    }

    /** The kind of this token. */
    public get type(): Type {
        return this.#type;
    }

    /** The range object of this token (the stored object itself, not a copy). */
    public get range(): Range {
        return this.#range;
    }

    /**
     * Sets the start of the range to a copy of `position`.
     *
     * @returns This token, to allow chaining.
     */
    public startingAt(position: Position): this {
        const { line, column, index } = position;
        this.#range.start = { line, column, index };

        return this;
    }

    /**
     * Sets the end of the range to a copy of `position`.
     *
     * @returns This token, to allow chaining.
     */
    public endingAt(position: Position): this {
        const { line, column, index } = position;
        this.#range.end = { line, column, index };

        return this;
    }

    /**
     * Sets both the start and the end of the range to separate copies of `position`.
     *
     * @returns This token, to allow chaining.
     */
    public at(position: Position): this {
        const { line, column, index } = position;
        this.#range.start = { line, column, index };
        this.#range.end = { line, column, index };

        return this;
    }
}
/**
 * Token of type `Type.Character` carrying immutable character data.
 */
export class CharacterToken extends Token {
    /**
     * Creates a character token holding `data`.
     *
     * @param data The character data of the token.
     */
    public static createWith(data: NonNullable<string>): CharacterToken {
        return new CharacterToken(data);
    }

    /** Creates a character token holding `REPLACEMENT_CHARACTER`. */
    public static createReplacementCharacter(): CharacterToken {
        return new CharacterToken(REPLACEMENT_CHARACTER);
    }

    /**
     * @param data The character data of the token.
     */
    public constructor(public readonly data: NonNullable<string>) {
        super(Type.Character);
    }
}
/**
 * Token of type `Type.Comment` whose data can be extended after construction.
 */
export class CommentToken extends Token {
    /** Creates a comment token with empty data. */
    public static createEmpty(): CommentToken {
        return new CommentToken('');
    }

    /**
     * Creates a comment token holding `data`.
     *
     * @param data The initial comment data.
     */
    public static createWith(data: string): CommentToken {
        return new CommentToken(data);
    }

    /**
     * @param data The initial comment data.
     */
    public constructor(public data: NonNullable<string>) {
        super(Type.Comment);
    }

    /** Appends `characters` to the end of the comment data. */
    public append(characters: string): void {
        this.data = this.data + characters;
    }

    /** Appends `REPLACEMENT_CHARACTER` to the end of the comment data. */
    public appendReplacementCharacter(): void {
        this.append(REPLACEMENT_CHARACTER);
    }
}
/**
 * Token of type `Type.EndOfFile`; it carries no data of its own.
 */
export class EndOfFileToken extends Token {
    /** Creates a new end-of-file token. */
    public static create(): EndOfFileToken {
        return new EndOfFileToken();
    }

    public constructor() {
        super(Type.EndOfFile);
    }
}
/**
 * Token of type `Type.StartTag` with a name that can be extended and an attribute list.
 */
export class StartTagToken extends Token {
    /** Creates a start tag token with an empty name and a fresh `AttributeList`. */
    public static createEmpty(): StartTagToken {
        return new StartTagToken('', new AttributeList());
    }

    /**
     * @param name The initial tag name.
     * @param attributes The attribute list belonging to this tag.
     */
    public constructor(public name: NonNullable<string>, public readonly attributes: AttributeList) {
        super(Type.StartTag);
    }

    /** Appends `characters` to the end of the tag name. */
    public appendToName(characters: string): void {
        this.name = this.name + characters;
    }

    /** Appends `REPLACEMENT_CHARACTER` to the end of the tag name. */
    public appendReplacementCharacterToName(): void {
        this.appendToName(REPLACEMENT_CHARACTER);
    }
}
/**
 * Token of type `Type.EndTag` with a name that can be extended and an attribute list.
 */
export class EndTagToken extends Token {
    /** Creates an end tag token with an empty name and a fresh `AttributeList`. */
    public static createEmpty(): EndTagToken {
        return new EndTagToken('', new AttributeList());
    }

    /**
     * @param name The initial tag name.
     * @param attributes The attribute list belonging to this tag.
     */
    public constructor(public name: NonNullable<string>, public readonly attributes: AttributeList) {
        super(Type.EndTag);
    }

    /** Appends `characters` to the end of the tag name. */
    public appendToName(characters: string): void {
        this.name = this.name + characters;
    }

    /** Appends `REPLACEMENT_CHARACTER` to the end of the tag name. */
    public appendReplacementCharacterToName(): void {
        this.appendToName(REPLACEMENT_CHARACTER);
    }
}
/**
 * Token of type `Type.DOCTYPE`; every field is optional and may be `undefined`.
 */
export class DOCTYPEToken extends Token {
    /** Creates a DOCTYPE token with only `forceQuirks` set to `true`. */
    public static createWithForcedQuirks(): DOCTYPEToken {
        return new DOCTYPEToken(undefined, undefined, undefined, true);
    }

    /**
     * Creates a DOCTYPE token with only its name set.
     *
     * @param name The initial DOCTYPE name.
     */
    public static createWithName(name: string): DOCTYPEToken {
        return new DOCTYPEToken(name);
    }

    /**
     * @param name The DOCTYPE name, if any.
     * @param publicIdentifier The public identifier, if any.
     * @param systemIdentifier The system identifier, if any.
     * @param forceQuirks `true` when the force-quirks flag is set.
     */
    public constructor(
        public name?: string,
        public publicIdentifier?: string,
        public systemIdentifier?: string,
        public forceQuirks?: true
    ) {
        super(Type.DOCTYPE);
    }

    /**
     * Appends `characters` to the end of the name.
     *
     * The name must already be set; this is checked with `VERIFY`.
     */
    public appendToName(characters: string): void {
        VERIFY(this.name !== undefined);

        this.name = this.name + characters;
    }

    /** Appends `REPLACEMENT_CHARACTER` to the end of the name. */
    public appendReplacementCharacterToName(): void {
        this.appendToName(REPLACEMENT_CHARACTER);
    }
}
/**
 * Renders a token as a string.
 *
 * Going by the `instanceof` lines: character tokens yield their data, comment
 * tokens `<!--data-->`, DOCTYPE tokens `<!DOCTYPE name>`, end-of-file tokens
 * `EOF`, end tags `</name>`, and start tags `<name` followed by one
 * ` name="value"` pair per entry of `attributes.list` and a closing `>`.
 * Any other token reaches `VERIFY_NOT_REACHED`.
 *
 * NOTE(review): this span appears to interleave two revisions of the function:
 * a `switch (token.type)` with `case Type.*` arms and an `instanceof` chain. The
 * braces do not balance as shown. Presumably only the `instanceof` chain is
 * meant to remain; confirm against the real file.
 */
export function stringify(token: Token): string {
switch (token.type) {
case Type.Character: return token.data;
case Type.Comment: return `<!--${token.data}-->`;
case Type.DOCTYPE: return `<!DOCTYPE ${token.name}>`;
case Type.EndOfFile: return 'EOF';
case Type.EndTag: return `</${token.name}>`;
case Type.StartTag: {
if (token instanceof CharacterToken) return token.data;
if (token instanceof CommentToken) return `<!--${token.data}-->`;
if (token instanceof DOCTYPEToken) return `<!DOCTYPE ${token.name}>`;
if (token instanceof EndOfFileToken) return 'EOF';
if (token instanceof EndTagToken) return `</${token.name}>`;
if (token instanceof StartTagToken) {
let string = `<${token.name}`;
for (const attribute of token.attributes.list)
string += ` ${attribute.name}="${attribute.value}"`;
if (token.selfClosing) return `${string} />`;
// TODO: Implement selfClosing
// if (token.selfClosing) return `${string} />`;
return `${string}>`;
};
}
VERIFY_NOT_REACHED(token.constructor.name);
return '';
}