diff --git a/html/highlighter.ts b/html/highlighter.ts
index 6e6b650..b34222f 100644
--- a/html/highlighter.ts
+++ b/html/highlighter.ts
@@ -47,7 +47,7 @@ export class Highlighter {
}
case State.Plain: {
switch (this.consumeNextTokenType()) {
- case Type.Character: this.currentNode.content += this.currentTokenOfType(Type.Character).data; break;
+ case Type.Character: this.currentNode.content += this.currentTokenOfType(CharacterToken).data; break;
default:
this.emitNode(this.currentNode);
this.reconsumeIn(State.Undefined);
@@ -56,15 +56,15 @@ export class Highlighter {
break;
}
case State.StartTag: {
- switch (this.consumeNextTokenOfType(Type.StartTag).name) {
+ switch (this.consumeNextTokenOfType(StartTagToken).name) {
case 'script': this.returnState = State.BeforeScript; break;
default: this.returnState = State.Undefined; break;
}
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: `<` });
- this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Tag, content: this.currentTokenOfType(Type.StartTag).name });
+ this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Tag, content: this.currentTokenOfType(StartTagToken).name });
- if (this.currentTokenOfType(Type.StartTag).attributes.nonEmpty()) {
+ if (this.currentTokenOfType(StartTagToken).attributes.nonEmpty()) {
this.emitSpace({ line: 0, character: 0 });
this.reconsumeIn(State.Attributes);
}
@@ -77,7 +77,7 @@ export class Highlighter {
}
case State.EndTag: {
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '' });
- this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Tag, content: this.consumeNextTokenOfType(Type.EndTag).name });
+ this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Tag, content: this.consumeNextTokenOfType(EndTagToken).name });
this.reconsumeIn(State.AfterAttributes);
@@ -86,7 +86,7 @@ export class Highlighter {
break;
}
case State.Attributes: {
- const attributes = this.consumeNextTokenOfEitherType(Type.StartTag, Type.EndTag).attributes.list;
+ const attributes = this.consumeNextTokenOfEitherType(StartTagToken, EndTagToken).attributes.list;
for (let i = 0; i < attributes.length; i++) {
const attribute = attributes[i];
@@ -103,12 +103,13 @@ export class Highlighter {
case State.AfterAttributes: {
switch (this.consumeNextTokenType()) {
case Type.StartTag:
- if (this.currentTokenOfType(Type.StartTag).selfClosing === undefined) {
+ // FIXME: StartTagToken does not support selfClosing as of now
+ // if (this.currentTokenOfType(StartTagToken).selfClosing === undefined) {
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '>' });
- } else {
- this.emitSpace({ line: 0, character: 0 });
- this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '/>' });
- }
+ // } else {
+ // this.emitSpace({ line: 0, character: 0 });
+ // this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '/>' });
+ // }
break;
case Type.EndTag:
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '>' });
@@ -132,7 +133,7 @@ export class Highlighter {
}
case State.Script: {
switch (this.consumeNextTokenType()) {
- case Type.Character: this.currentNode.content += this.currentTokenOfType(Type.Character).data; break;
+ case Type.Character: this.currentNode.content += this.currentTokenOfType(CharacterToken).data; break;
default:
this.emitNode(this.currentNode);
this.reconsumeIn(State.Undefined);
@@ -141,7 +142,7 @@ export class Highlighter {
break;
}
case State.DOCTYPE: {
- const doctype = this.consumeNextTokenOfType(Type.DOCTYPE);
+ const doctype = this.consumeNextTokenOfType(DOCTYPEToken);
this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '` });
+ this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Comment, content: `` });
this.state = State.Undefined;
break;
@@ -177,24 +178,24 @@ export class Highlighter {
return this.currentNode = node;
}
- private consumeNextTokenOfType(type: T): Token & { type: T } {
+ /**
+ * Reads the token at the current pointer, checks via VERIFY that it is an instance of
+ * `type`, advances the pointer past it and returns it typed as `T`.
+ *
+ * NOTE(review): returning `this.currentToken` as `T` without a cast presumably relies on
+ * VERIFY being declared as an assertion function so that `instanceof` narrows - confirm.
+ *
+ * @param type Constructor of the token class the next token is expected to be.
+ * @returns The consumed token.
+ */
+ private consumeNextTokenOfType<T extends Token>(type: Constructor<T>): T {
this.currentToken = this.tokens[this.pointer];
- VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`);
+ VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`);
this.pointer++;
- return this.currentToken as Token & { type: T };
+ return this.currentToken;
}
- private consumeNextTokenOfEitherType(a: T, b: U): Token & { type: T | U } {
+ /**
+ * Reads the token at the current pointer, checks via VERIFY that it is an instance of
+ * either `a` or `b`, advances the pointer past it and returns it typed as `T | U`.
+ *
+ * NOTE(review): the cast-free return presumably relies on VERIFY being an assertion
+ * function - confirm.
+ *
+ * @param a First accepted token constructor.
+ * @param b Second accepted token constructor.
+ * @returns The consumed token.
+ */
+ private consumeNextTokenOfEitherType<T extends Token, U extends Token>(a: Constructor<T>, b: Constructor<U>): T | U {
this.currentToken = this.tokens[this.pointer];
- VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);
+ VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`);
this.pointer++;
- return this.currentToken as Token & { type: T };
+ return this.currentToken;
}
private consumeNextTokenType(): Type {
@@ -211,16 +212,16 @@ export class Highlighter {
return this.currentToken;
}
- private currentTokenOfType(type: T): Token & { type: T } {
- VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`);
+ /**
+ * Returns the current token typed as `T` after checking via VERIFY that it is an
+ * instance of `type`. Does not move the pointer.
+ *
+ * NOTE(review): the cast-free return presumably relies on VERIFY being an assertion
+ * function - confirm.
+ *
+ * @param type Constructor of the token class the current token is expected to be.
+ * @returns The current token.
+ */
+ private currentTokenOfType<T extends Token>(type: Constructor<T>): T {
+ VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`);
- return this.currentToken as Token & { type: T };
+ return this.currentToken;
}
- private currentTokenOfEitherType(a: T, b: U): Token & { type: T | U } {
- VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);
+ /**
+ * Returns the current token typed as `T | U` after checking via VERIFY that it is an
+ * instance of either `a` or `b`. Does not move the pointer.
+ *
+ * NOTE(review): the cast-free return presumably relies on VERIFY being an assertion
+ * function - confirm.
+ *
+ * @param a First accepted token constructor.
+ * @param b Second accepted token constructor.
+ * @returns The current token.
+ */
+ private currentTokenOfEitherType<T extends Token, U extends Token>(a: Constructor<T>, b: Constructor<U>): T | U {
+ VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`);
- return this.currentToken as Token & { type: T };
+ return this.currentToken;
}
private reconsumeIn(state: State): void {
diff --git a/html/tokenizer.ts b/html/tokenizer.ts
index 98476fe..6709751 100644
--- a/html/tokenizer.ts
+++ b/html/tokenizer.ts
@@ -1,8 +1,9 @@
-import { TODO, VERIFY } from "../util/assertions.js";
+import { TODO, VERIFY, VERIFY_NOT_REACHED } from "../util/assertions.js";
+import { Constructor } from "../util/guards.js";
import { ParseError } from "./errors.js";
import { entities } from "./tokenizer/entities.js";
import { State } from "./tokenizer/state.js";
-import { AttributeList, Token, Type } from "./tokenizer/token.js";
+import { Attribute, CharacterToken, CommentToken, DOCTYPEToken, EndOfFileToken, EndTagToken, Position, StartTagToken, Token } from "./tokenizer/token.js";
export class Tokenizer {
private state: State = State.Data;
@@ -13,6 +14,8 @@ export class Tokenizer {
private currentToken!: Token;
private currentInputCharacter!: string;
+ private currentPosition: Position = { line: 0, column: 0, index: 0 };
+
public tokens: Array = new Array();
private pointer: number = 0;
@@ -30,10 +33,10 @@ export class Tokenizer {
case '\u003C': this.state = State.TagOpen; break;
case '\u0000':
this.parseError('unexpected-null-character');
- this.emit({ type: Type.Character, data: this.currentInputCharacter });
+ this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
break;
- case undefined: this.emit({ type: Type.EndOfFile }); break;
- default: this.emit({ type: Type.Character, data: this.currentInputCharacter });
+ case undefined: this.emit(EndOfFileToken.create()); break;
+ default: this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
}
break;
@@ -41,9 +44,9 @@ export class Tokenizer {
case State.RCDATA: {
switch (this.consumeNext()) {
case '\u003C': this.state = State.RAWTEXTLessThan; break;
- case '\u0000': this.parseError('unexpected-null-character'); this.emit({ type: Type.Character, data: '\uFFFD' }); break;
- case undefined: this.emit({ type: Type.EndOfFile }); break;
- default: this.emit({ type: Type.Character, data: this.currentInputCharacter });
+ case '\u0000': this.parseError('unexpected-null-character'); this.emit(CharacterToken.createReplacementCharacter().at(this.currentPosition)); break;
+ case undefined: this.emit(EndOfFileToken.create()); break;
+ default: this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
}
break;
@@ -54,23 +57,23 @@ export class Tokenizer {
case '\u002F': this.state = State.EndTagOpen; break;
case '\u003F':
this.parseError('unexpected-question-mark-instead-of-tag-name');
- this.create({ type: Type.Comment, data: '' });
+ this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
this.reconsumeIn(State.BogusComment);
break;
case undefined:
this.parseError('eof-before-tag-name');
- this.emit({ type: Type.Character, data: '\u003C' });
- this.emit({ type: Type.EndOfFile });
+ this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
+ this.emit(EndOfFileToken.create());
break;
default: {
if (this.asciiAlpha(this.currentInputCharacter)) {
- this.create({ type: Type.StartTag, name: '', attributes: new AttributeList() });
+ this.create(StartTagToken.createEmpty().startingAt(this.currentPosition));
this.reconsumeIn(State.TagName);
break;
}
this.parseError('invalid-first-character-of-tag-name');
- this.emit({ type: Type.Character, data: '\u003C' });
+ this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
this.reconsumeIn(State.Data);
}
}
@@ -82,19 +85,19 @@ export class Tokenizer {
case '\u003E': this.parseError('missing-end-tag-name'); this.state = State.Data; break;
case undefined:
this.parseError('eof-before-tag-name');
- this.emit({ type: Type.Character, data: '\u003C' });
- this.emit({ type: Type.Character, data: '\u002F' });
- this.emit({ type: Type.EndOfFile });
+ this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
+ this.emit(CharacterToken.createWith('\u002F').at(this.currentPosition));
+ this.emit(EndOfFileToken.create());
break;
default: {
if (this.asciiAlpha(this.currentInputCharacter)) {
- this.create({ type: Type.EndTag, name: '', attributes: new AttributeList() });
+ this.create(EndTagToken.createEmpty().startingAt(this.currentPosition));
this.reconsumeIn(State.TagName);
break;
}
this.parseError('invalid-first-character-of-tag-name');
- this.create({ type: Type.Comment, data: '' });
+ this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
this.reconsumeIn(State.BogusComment);
}
}
@@ -104,7 +107,7 @@ export class Tokenizer {
case State.MarkupDeclarationOpen: {
if (this.matchNextFew('--')) {
this.consumeNextFew('--');
- this.create({ type: Type.Comment, data: '' });
+ this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
this.state = State.CommentStart;
} else if (this.matchNextFewCaseInsensitive('DOCTYPE')) {
this.consumeNextFewCaseInsensitive('DOCTYPE');
@@ -114,11 +117,11 @@ export class Tokenizer {
// NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the CDATA section state does not
// exist and will not be started here.
this.parseError('cdata-in-html-content');
- this.create({ type: Type.Comment, data: '[CDATA[' });
+ this.create(CommentToken.createWith('[CDATA[').startingAt(this.currentPosition));
this.state = State.BogusComment;
} else {
this.parseError('incorrectly-opened-comment');
- this.create({ type: Type.Comment, data: '' });
+ this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
this.state = State.BogusComment;
}
@@ -133,8 +136,8 @@ export class Tokenizer {
case '\u003E': this.reconsumeIn(State.BeforeDOCTYPEName); break;
case undefined:
this.parseError('eof-in-doctype');
- this.emit({ type: Type.DOCTYPE, forceQuirks: true });
- this.emit({ type: Type.EndOfFile });
+ this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition));
+ this.emit(EndOfFileToken.create());
break;
default:
this.parseError('missing-whitespace-before-doctype-name');
@@ -151,22 +154,22 @@ export class Tokenizer {
case '\u0020': break;
case '\u0000':
this.parseError('unexpected-null-character');
- this.create({ type: Type.DOCTYPE, name: '\uFFFD' });
+ this.create(DOCTYPEToken.createWithName('\uFFFD').startingAt(this.currentPosition));
this.state = State.DOCTYPEName;
break;
case undefined:
this.parseError('eof-in-doctype');
- this.emit({ type: Type.DOCTYPE, forceQuirks: true });
- this.emit({ type: Type.EndOfFile });
+ this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition));
+ this.emit(EndOfFileToken.create());
break;
default: {
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
- this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter.toLowerCase()});
+ this.create(DOCTYPEToken.createWithName(this.currentInputCharacter.toLowerCase()).startingAt(this.currentPosition));
this.state = State.DOCTYPEName;
break;
}
- this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter });
+ this.create(DOCTYPEToken.createWithName(this.currentInputCharacter).startingAt(this.currentPosition));
this.state = State.DOCTYPE;
}
}
@@ -179,21 +182,21 @@ export class Tokenizer {
case '\u000A':
case '\u000C':
case '\u0020': this.state = State.AfterDOCTYPEName; break;
- case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.DOCTYPE); break;
- case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.DOCTYPE)!.name += '\uFFFD'; break;
+ case '\u003E': this.state = State.Data; this.emitCurrentOfType(DOCTYPEToken); break;
+ case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(DOCTYPEToken).appendReplacementCharacterToName(); break;
case undefined:
this.parseError('eof-in-doctype');
- this.currentOfType(Type.DOCTYPE).forceQuirks = true;
- this.emitCurrentOfType(Type.DOCTYPE);
- this.emit({ type: Type.EndOfFile });
+ this.currentOfType(DOCTYPEToken).forceQuirks = true;
+ this.emitCurrentOfType(DOCTYPEToken);
+ this.emit(EndOfFileToken.create());
break;
default: {
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
- this.currentOfType(Type.DOCTYPE)!.name += this.currentInputCharacter.toLowerCase();
+ this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter.toLowerCase());
break;
}
- this.currentOfType(Type.DOCTYPE)!.name += this.currentInputCharacter;
+ this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter);
}
}
@@ -206,19 +209,19 @@ export class Tokenizer {
case '\u000C':
case '\u0020': this.state = State.BeforeAttributeName; break;
case '\u002F': this.state = State.SelfClosingStartTag; break;
- case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
+ case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
case '\u0000':
this.parseError('unexpected-null-character');
- this.currentOfEitherType(Type.StartTag, Type.EndTag).name += '\uFFFD';
+ this.currentOfEitherType(StartTagToken, EndTagToken).appendReplacementCharacterToName();
break;
- case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
+ case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
default: {
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
- this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter.toLowerCase();
+ this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter.toLowerCase());
break;
}
- this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter;
+ this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter);
}
}
@@ -235,12 +238,12 @@ export class Tokenizer {
case undefined: this.reconsumeIn(State.AfterAttributeName); break;
case '\u003D': {
this.parseError('unexpected-equals-sign-before-attribute-name');
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: this.currentInputCharacter, value: '' });
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyValue(this.currentInputCharacter));
this.state = State.AttributeName;
break;
}
default: {
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue());
this.reconsumeIn(State.AttributeName);
}
}
@@ -258,21 +261,21 @@ export class Tokenizer {
case undefined: this.reconsumeIn(State.AfterAttributeName); break;
case '\u003D': this.state = State.BeforeAttributeValue; break;
case '\u0000': this.parseError('unexpected-null-character');
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += '\uFFFD';
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToName();
break;
case '\u0022':
case '\u0027':
case '\u003C':
this.parseError('unexpected-character-in-attribute-name');
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter);
break;
default: {
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter.toLowerCase();
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter.toLowerCase());
break;
}
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter;
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter);
}
}
@@ -286,10 +289,10 @@ export class Tokenizer {
case '\u0020': break;
case '\u002F': this.state = State.SelfClosingStartTag; break;
case '\u003D': this.state = State.BeforeAttributeValue; break;
- case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
- case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
+ case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
+ case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
default:
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' });
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue());
this.reconsumeIn(State.AttributeName);
break;
}
@@ -307,7 +310,7 @@ export class Tokenizer {
case '\u003E':
this.parseError('missing-attribute-value');
this.state = State.Data;
- this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag);
+ this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
break;
default:
this.reconsumeIn(State.AttributeValueUnquoted);
@@ -321,10 +324,10 @@ export class Tokenizer {
case '\u0026': this.returnState = State.AttributeValueDouble; this.state = State.CharacterReference; break;
case '\u0000':
this.parseError('unexpected-null-character');
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
break;
- case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
- default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
+ case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
+ default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
}
break;
@@ -335,10 +338,10 @@ export class Tokenizer {
case '\u0026': this.returnState = State.AttributeValueSingle; this.state = State.CharacterReference; break;
case '\u0000':
this.parseError('unexpected-null-character');
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
break;
- case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
- default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
+ case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
+ default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
}
break;
@@ -350,10 +353,10 @@ export class Tokenizer {
case '\u000C':
case '\u0020': this.state = State.BeforeAttributeName; break;
case '\u0026': this.returnState = State.AttributeValueUnquoted; this.state = State.CharacterReference; break;
- case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
+ case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
case '\u0000':
this.parseError('unexpected-null-character');
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD';
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
break;
case '\u0022':
case '\u0027':
@@ -361,10 +364,10 @@ export class Tokenizer {
case '\u003D':
case '\u0060':
this.parseError('unexpected-character-in-unquoted-attribute-value');
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
break;
- case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
- default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
+ case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
+ default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
}
break;
@@ -376,8 +379,8 @@ export class Tokenizer {
case '\u000C':
case '\u0020': this.state = State.BeforeAttributeName; break;
case '\u002F': this.state = State.SelfClosingStartTag; break;
- case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break;
- case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break;
+ case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
+ case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
default: this.parseError('missing-whitespace-between-attributes'); this.reconsumeIn(State.BeforeAttributeName);
}
@@ -386,7 +389,7 @@ export class Tokenizer {
case State.CommentStart: {
switch (this.consumeNext()) {
case '\u002D': this.state = State.CommentStartDash; break;
- case '\u003E': this.parseError('abrupt-closing-of-empty-comment'); this.state = State.Data; this.emitCurrentOfType(Type.Comment); break;
+ case '\u003E': this.parseError('abrupt-closing-of-empty-comment'); this.state = State.Data; this.emitCurrentOfType(CommentToken); break;
default: this.reconsumeIn(State.Comment);
}
@@ -395,11 +398,11 @@ export class Tokenizer {
// FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state (adding **current** in some places)
case State.Comment: {
switch (this.consumeNext()) {
- case '\u003C': this.currentOfType(Type.Comment).data += this.currentInputCharacter; this.state = State.CommentLessThanSign; break;
+ case '\u003C': this.currentOfType(CommentToken).append(this.currentInputCharacter); this.state = State.CommentLessThanSign; break;
case '\u002D': this.state = State.CommentEndDash; break;
- case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.Comment).data += '\uFFFD'; break;
- case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
- default: this.currentOfType(Type.Comment).data += this.currentInputCharacter;
+ case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(CommentToken).appendReplacementCharacter(); break;
+ case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
+ default: this.currentOfType(CommentToken).append(this.currentInputCharacter);
}
break;
@@ -407,8 +410,8 @@ export class Tokenizer {
case State.CommentEndDash: {
switch (this.consumeNext()) {
case '\u002D': this.state = State.CommentEnd; break;
- case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
- default: this.currentOfType(Type.Comment).data += '\u002D'; this.reconsumeIn(State.Comment);
+ case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
+ default: this.currentOfType(CommentToken).append('\u002D'); this.reconsumeIn(State.Comment);
}
break;
@@ -416,11 +419,11 @@ export class Tokenizer {
// Same as above fixme https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
case State.CommentEnd: {
switch (this.consumeNext()) {
- case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.Comment); break;
+ case '\u003E': this.state = State.Data; this.emitCurrentOfType(CommentToken); break;
case '\u0021': this.state = State.CommentEndBang; break;
- case '\u002D': this.currentOfType(Type.Comment).data += '\u002D'; break;
- case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
- default: this.currentOfType(Type.Comment).data += '\u002D\u002D'; this.reconsumeIn(State.Comment);
+ case '\u002D': this.currentOfType(CommentToken).append('\u002D'); break;
+ case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
+ default: this.currentOfType(CommentToken).append('\u002D\u002D'); this.reconsumeIn(State.Comment);
}
break;
@@ -428,10 +431,10 @@ export class Tokenizer {
// Same as above https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
case State.BogusComment: {
switch (this.consumeNext()) {
- case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.Comment); break;
- case undefined: this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break;
- case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.Comment).data += '\uFFFD'; break;
- default: this.currentOfType(Type.Comment).data += this.currentInputCharacter;
+ case '\u003E': this.state = State.Data; this.emitCurrentOfType(CommentToken); break;
+ case undefined: this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
+ case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(CommentToken).appendReplacementCharacter(); break;
+ default: this.currentOfType(CommentToken).append(this.currentInputCharacter);
}
break;
@@ -496,9 +499,9 @@ export class Tokenizer {
default: {
if (this.asciiAlphanumeric(this.currentInputCharacter)) {
if (this.consumedAsPartOfAnAttribute()) {
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter;
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
} else {
- this.emit({ type: Type.Character, data: this.currentInputCharacter });
+ this.emit(CharacterToken.createWith(this.currentInputCharacter));
}
break;
@@ -516,12 +519,12 @@ export class Tokenizer {
private flushCodePointsConsumedAsCharacterReference(): void {
if (this.consumedAsPartOfAnAttribute()) {
- this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.temporaryBuffer;
+ this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.temporaryBuffer);
return;
}
for (const codePoint of this.temporaryBuffer)
- this.emit({ type: Type.Character, data: codePoint });
+ this.emit(CharacterToken.createWith(codePoint));
}
private consumedAsPartOfAnAttribute(): boolean {
@@ -562,6 +565,14 @@ export class Tokenizer {
this.currentInputCharacter = this.input[this.pointer];
this.pointer++;
+ this.currentPosition.column++;
+ this.currentPosition.index++;
+
+ if (this.currentInputCharacter === '\n') {
+ this.currentPosition.column = 0;
+ this.currentPosition.line++;
+ }
+
return this.currentInputCharacter;
}
@@ -594,34 +605,53 @@ export class Tokenizer {
}
private emit(token: Token): void {
+ this.populateRangeOnEmit(token); // fill in any still-missing start/end of the token's range before it is stored
this.tokens.push(token);
}
- private emitCurrentOfType(type: Type): void {
- VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`);
+ /**
+ * Emits the token currently under construction: checks via VERIFY that it is an
+ * instance of `type`, fills in any missing part of its range, and appends it to `tokens`.
+ *
+ * @param type Constructor of the token class the current token is expected to be.
+ */
+ private emitCurrentOfType(type: Constructor<Token>): void {
+ VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`);
+ this.populateRangeOnEmit(this.currentToken);
this.tokens.push(this.currentToken);
}
- private emitCurrentOfEitherType(a: Type, b: Type): void {
- VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);
+ /**
+ * Emits the token currently under construction: checks via VERIFY that it is an
+ * instance of either `a` or `b`, fills in any missing part of its range, and appends
+ * it to `tokens`.
+ *
+ * @param a First accepted token constructor.
+ * @param b Second accepted token constructor.
+ */
+ private emitCurrentOfEitherType(a: Constructor<Token>, b: Constructor<Token>): void {
+ VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`);
+ this.populateRangeOnEmit(this.currentToken);
this.tokens.push(this.currentToken);
}
- private currentOfType(type: T): Token & { type: T } {
- VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`);
+ /**
+ * Returns the token currently under construction typed as `T`, after checking via
+ * VERIFY that it is an instance of `type`.
+ *
+ * This is a pure accessor and deliberately does not touch the token's range. Tokens
+ * handed to `create` already have a start, so populating the range here would fix the
+ * end at the first access and the later emit would no longer record the real end.
+ *
+ * NOTE(review): the cast-free return presumably relies on VERIFY being an assertion
+ * function - confirm.
+ *
+ * @param type Constructor of the token class the current token is expected to be.
+ * @returns The current token.
+ */
+ private currentOfType<T extends Token>(type: Constructor<T>): T {
+ VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`);
- return this.currentToken as Token & { type: T };
+ return this.currentToken;
}
- private currentOfEitherType(a: T, b: U): Token & { type: T | U } {
- VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`);
+ /**
+ * Returns the token currently under construction typed as `T | U`, after checking via
+ * VERIFY that it is an instance of either `a` or `b`.
+ *
+ * This is a pure accessor and deliberately does not touch the token's range. Tokens
+ * handed to `create` already have a start, so populating the range here would fix the
+ * end at the first access and the later emit would no longer record the real end.
+ *
+ * NOTE(review): the cast-free return presumably relies on VERIFY being an assertion
+ * function - confirm.
+ *
+ * @param a First accepted token constructor.
+ * @param b Second accepted token constructor.
+ * @returns The current token.
+ */
+ private currentOfEitherType<T extends Token, U extends Token>(a: Constructor<T>, b: Constructor<U>): T | U {
+ VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`);
- return this.currentToken as Token & { type: T };
+ return this.currentToken;
+ }
+
+ /**
+ * Completes the range of `token` using the tokenizer's current position:
+ * - neither start nor end set: `token.at(currentPosition)`;
+ * - start set but no end: `token.endingAt(currentPosition)`;
+ * - end set without a start: treated as impossible (VERIFY_NOT_REACHED).
+ *
+ * The three checks run one after another and each re-reads the range, because the
+ * calls above them may have changed it.
+ */
+ private populateRangeOnEmit(token: Token): void {
+ const startMissing = (): boolean => token.range.start === undefined;
+ const endMissing = (): boolean => token.range.end === undefined;
+
+ if (startMissing() && endMissing()) {
+ token.at(this.currentPosition);
+ }
+
+ if (!startMissing() && endMissing()) {
+ token.endingAt(this.currentPosition);
+ }
+
+ if (startMissing() && !endMissing()) {
+ VERIFY_NOT_REACHED();
+ }
}
private create(token: Token): Token {
+ if (token.range.start === undefined) // caller did not chain startingAt(): default the start to the current position
+ token.startingAt(this.currentPosition);
+
return this.currentToken = token;
}
}
diff --git a/html/tokenizer/token.ts b/html/tokenizer/token.ts
index 1c9a789..ea1aced 100644
--- a/html/tokenizer/token.ts
+++ b/html/tokenizer/token.ts
@@ -1,3 +1,5 @@
+import { VERIFY, VERIFY_NOT_REACHED } from "../../util/assertions.js";
+
export const enum Type {
DOCTYPE = 'DOCTYPE',
StartTag = 'start tag',
@@ -7,7 +9,52 @@ export const enum Type {
EndOfFile = 'end-of-file'
}
-export type Attribute = { name: NonNullable, value: NonNullable };
+export const REPLACEMENT_CHARACTER = '\uFFFD'; // U+FFFD REPLACEMENT CHARACTER; appended by the appendReplacementCharacter helpers of the token classes in this file.
+
+export type Range = { // Span of a token, expressed as a start and an end Position.
+ start: Position, // NOTE: Token initialises its range to an empty object, so this can be undefined at runtime until set.
+ end: Position // NOTE: same caveat as start; populateRangeOnEmit checks both against undefined.
+}
+
+export type Position = { // A location that Token copies field by field into its range.
+ line: number,
+ column: number,
+ index: number // NOTE(review): presumably an offset into the input; whether these values are zero-based is not shown here, confirm.
+}
+
+export class Attribute { // Mutable name/value pair that is built up incrementally through the append helpers.
+ public name: string;
+ public value: string;
+
+ public constructor(name: string, value: string) {
+ this.name = name;
+ this.value = value;
+ }
+
+ public appendToName(characters: string): void { // Concatenates characters onto the end of the name.
+ this.name += characters;
+ }
+
+ public appendReplacementCharacterToName(): void { // Appends REPLACEMENT_CHARACTER to the name.
+ this.appendToName(REPLACEMENT_CHARACTER);
+ }
+
+ public appendToValue(characters: string): void { // Concatenates characters onto the end of the value.
+ this.value += characters;
+ }
+
+ public appendReplacementCharacterToValue(): void { // Appends REPLACEMENT_CHARACTER to the value.
+ this.appendToValue(REPLACEMENT_CHARACTER);
+ }
+
+ public static createWithEmptyNameAndValue(): Attribute { // Factory: both name and value start as the empty string.
+ return new Attribute('', '');
+ }
+
+ public static createWithEmptyValue(name: string): Attribute { // Factory: given name, empty string value.
+ return new Attribute(name, '');
+ }
+}
export class AttributeList {
private attributes: Array;
@@ -33,29 +80,200 @@ export class AttributeList {
}
}
-export type Token = { type: Type.DOCTYPE, name?: string, publicIdentifier?: string, systemIdentifier?: string, forceQuirks?: true } |
- { type: Type.StartTag, name: NonNullable, selfClosing?: true, attributes: AttributeList } |
- { type: Type.EndTag, name: NonNullable, selfClosing?: true, attributes: AttributeList } |
- { type: Type.Comment, data: NonNullable } |
- { type: Type.Character, data: NonNullable } |
- { type: Type.EndOfFile };
+export abstract class Token { // Base class of all token classes; holds the Type tag and the Range of the token.
+ #type: Type;
+ #range!: Range;
+
+ protected constructor(type: Type) { // Protected: instances are created through the concrete subclasses.
+ this.#type = type;
+
+ // @ts-expect-error
+ this.#range = {}; // Deliberately empty: start and end stay undefined until startingAt, endingAt or at is called.
+ }
+
+ public startingAt(position: Position): this { // Sets the range start to a copy of position; returns this for chaining.
+ this.#range.start = { line: position.line, column: position.column, index: position.index };
+
+ return this;
+ }
+
+ public endingAt(position: Position): this { // Sets the range end to a copy of position; returns this for chaining.
+ this.#range.end = { line: position.line, column: position.column, index: position.index };
+
+ return this;
+ }
+
+ public at(position: Position): this { // Sets start and end to two separate copies of position; returns this for chaining.
+ this.#range.start = { line: position.line, column: position.column, index: position.index };
+ this.#range.end = { line: position.line, column: position.column, index: position.index };
+
+ return this;
+ }
+
+ public get range(): Range { // Returns the internal range object itself, not a copy.
+ return this.#range;
+ }
+
+ public get type(): Type { // The Type tag passed to the constructor.
+ return this.#type;
+ }
+}
+
+export class CharacterToken extends Token { // Token tagged Type.Character that carries read-only character data.
+ public readonly data: NonNullable;
+
+ public constructor(data: NonNullable) {
+ super(Type.Character);
+
+ this.data = data;
+ }
+
+ public static createWith(data: NonNullable): CharacterToken { // Factory equivalent to calling the constructor with data.
+ return new CharacterToken(data);
+ }
+
+ public static createReplacementCharacter(): CharacterToken { // Factory: token whose data is REPLACEMENT_CHARACTER.
+ return new CharacterToken(REPLACEMENT_CHARACTER);
+ }
+}
+
+export class CommentToken extends Token { // Token tagged Type.Comment whose data can be extended after construction.
+ public data: NonNullable;
+
+ public constructor(data: NonNullable) {
+ super(Type.Comment);
+
+ this.data = data;
+ }
+
+ public append(characters: string): void { // Concatenates characters onto the end of data.
+ this.data += characters;
+ }
+
+ public appendReplacementCharacter(): void { // Appends REPLACEMENT_CHARACTER to data.
+ this.append(REPLACEMENT_CHARACTER);
+ }
+
+ public static createEmpty(): CommentToken { // Factory: data starts as the empty string.
+ return new CommentToken('');
+ }
+
+ public static createWith(data: string): CommentToken { // Factory equivalent to calling the constructor with data.
+ return new CommentToken(data);
+ }
+}
+
+export class EndOfFileToken extends Token { // Token tagged Type.EndOfFile; it carries no payload of its own.
+ public constructor() {
+ super(Type.EndOfFile);
+ }
+
+ public static create(): EndOfFileToken { // Factory equivalent to calling the constructor.
+ return new EndOfFileToken();
+ }
+}
+
+export class StartTagToken extends Token { // Token tagged Type.StartTag with a mutable name and an attribute list.
+ public name: NonNullable;
+ public readonly attributes: AttributeList; // The reference is fixed at construction; readonly does not by itself prevent changes to the list contents.
+
+ public constructor(name: NonNullable, attributes: AttributeList) {
+ super(Type.StartTag);
+
+ this.name = name;
+ this.attributes = attributes;
+ }
+
+ public appendToName(characters: string): void { // Concatenates characters onto the end of the tag name.
+ this.name += characters;
+ }
+
+ public appendReplacementCharacterToName(): void { // Appends REPLACEMENT_CHARACTER to the tag name.
+ this.appendToName(REPLACEMENT_CHARACTER);
+ }
+
+ public static createEmpty(): StartTagToken { // Factory: empty name and a newly constructed AttributeList.
+ return new StartTagToken('', new AttributeList());
+ }
+}
+
+export class EndTagToken extends Token { // Token tagged Type.EndTag; same members as StartTagToken but a distinct class for instanceof checks.
+ public name: NonNullable;
+ public readonly attributes: AttributeList; // The reference is fixed at construction; readonly does not by itself prevent changes to the list contents.
+
+ public constructor(name: NonNullable, attributes: AttributeList) {
+ super(Type.EndTag);
+
+ this.name = name;
+ this.attributes = attributes;
+ }
+
+ public appendToName(characters: string): void { // Concatenates characters onto the end of the tag name.
+ this.name += characters;
+ }
+
+ public appendReplacementCharacterToName(): void { // Appends REPLACEMENT_CHARACTER to the tag name.
+ this.appendToName(REPLACEMENT_CHARACTER);
+ }
+
+ public static createEmpty(): EndTagToken { // Factory: empty name and a newly constructed AttributeList.
+ return new EndTagToken('', new AttributeList());
+ }
+}
+
+export class DOCTYPEToken extends Token { // Token tagged Type.DOCTYPE; every field is optional and may be undefined.
+ public name?: string;
+ public publicIdentifier?: string;
+ public systemIdentifier?: string;
+ public forceQuirks?: true; // Either true or undefined; there is no explicit false state.
+
+ public constructor(name?: string, publicIdentifier?: string, systemIdentifier?: string, forceQuirks?: true) {
+ super(Type.DOCTYPE);
+
+ this.name = name;
+ this.publicIdentifier = publicIdentifier;
+ this.systemIdentifier = systemIdentifier;
+ this.forceQuirks = forceQuirks;
+ }
+
+ public appendToName(characters: string): void { // Concatenates characters onto an already defined name.
+ VERIFY(this.name !== undefined); // Asserts that a name was set before anything is appended to it.
+
+ this.name += characters;
+ }
+
+ public appendReplacementCharacterToName(): void { // Appends REPLACEMENT_CHARACTER to the name, subject to the same assertion.
+ this.appendToName(REPLACEMENT_CHARACTER);
+ }
+
+ public static createWithForcedQuirks(): DOCTYPEToken { // Factory: no name or identifiers, forceQuirks set to true.
+ return new DOCTYPEToken(undefined, undefined, undefined, true);
+ }
+
+ public static createWithName(name: string): DOCTYPEToken { // Factory: only the name is set; all other fields stay undefined.
+ return new DOCTYPEToken(name, undefined, undefined, undefined);
+ }
+}
export function stringify(token: Token): string {
- switch (token.type) {
- case Type.Character: return token.data;
- case Type.Comment: return ``;
- case Type.DOCTYPE: return ``;
- case Type.EndOfFile: return 'EOF';
- case Type.EndTag: return `${token.name}>`;
- case Type.StartTag: {
+ if (token instanceof CharacterToken) return token.data; // Dispatch is now by class (instanceof) instead of by the type tag.
+ if (token instanceof CommentToken) return ``; // NOTE(review): the literal is empty as shown here; confirm the intended comment markup was not lost.
+ if (token instanceof DOCTYPEToken) return ``; // NOTE(review): the literal is empty as shown here; confirm the intended doctype markup was not lost.
+ if (token instanceof EndOfFileToken) return 'EOF';
+ if (token instanceof EndTagToken) return `${token.name}>`; // NOTE(review): as shown there is no opening delimiter before the name; confirm.
+ if (token instanceof StartTagToken) {
let string = `<${token.name}`;
for (const attribute of token.attributes.list)
string += ` ${attribute.name}="${attribute.value}"`;
- if (token.selfClosing) return `${string} />`;
+ // TODO: Implement selfClosing
+ // if (token.selfClosing) return `${string} />`;
return `${string}>`;
- };
}
+
+ VERIFY_NOT_REACHED(token.constructor.name); // Reached only when the token matches none of the classes tested above.
+
+ return ''; // NOTE(review): presumably present only to satisfy the declared string return type; confirm VERIFY_NOT_REACHED never returns.
}
\ No newline at end of file