From cba7f2b58f71ddce34708c2d0b814390d8e4946e Mon Sep 17 00:00:00 2001 From: networkException Date: Tue, 4 Jan 2022 00:40:12 +0100 Subject: [PATCH] Token: Rewrite using classes, initial range implementation --- html/highlighter.ts | 53 ++++----- html/tokenizer.ts | 214 +++++++++++++++++++--------------- html/tokenizer/token.ts | 250 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 383 insertions(+), 134 deletions(-) diff --git a/html/highlighter.ts b/html/highlighter.ts index 6e6b650..b34222f 100644 --- a/html/highlighter.ts +++ b/html/highlighter.ts @@ -47,7 +47,7 @@ export class Highlighter { } case State.Plain: { switch (this.consumeNextTokenType()) { - case Type.Character: this.currentNode.content += this.currentTokenOfType(Type.Character).data; break; + case Type.Character: this.currentNode.content += this.currentTokenOfType(CharacterToken).data; break; default: this.emitNode(this.currentNode); this.reconsumeIn(State.Undefined); @@ -56,15 +56,15 @@ export class Highlighter { break; } case State.StartTag: { - switch (this.consumeNextTokenOfType(Type.StartTag).name) { + switch (this.consumeNextTokenOfType(StartTagToken).name) { case 'script': this.returnState = State.BeforeScript; break; default: this.returnState = State.Undefined; break; } this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: `<` }); - this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Tag, content: this.currentTokenOfType(Type.StartTag).name }); + this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Tag, content: this.currentTokenOfType(StartTagToken).name }); - if (this.currentTokenOfType(Type.StartTag).attributes.nonEmpty()) { + if (this.currentTokenOfType(StartTagToken).attributes.nonEmpty()) { this.emitSpace({ line: 0, character: 0 }); this.reconsumeIn(State.Attributes); } @@ -77,7 +77,7 @@ export class Highlighter { } case State.EndTag: { this.emitNode({ position: { line: 0, character: 0 }, color: 
Palette.Punctuator, content: '' }); - } else { - this.emitSpace({ line: 0, character: 0 }); - this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '/>' }); - } + // } else { + // this.emitSpace({ line: 0, character: 0 }); + // this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '/>' }); + // } break; case Type.EndTag: this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '>' }); @@ -132,7 +133,7 @@ export class Highlighter { } case State.Script: { switch (this.consumeNextTokenType()) { - case Type.Character: this.currentNode.content += this.currentTokenOfType(Type.Character).data; break; + case Type.Character: this.currentNode.content += this.currentTokenOfType(CharacterToken).data; break; default: this.emitNode(this.currentNode); this.reconsumeIn(State.Undefined); @@ -141,7 +142,7 @@ export class Highlighter { break; } case State.DOCTYPE: { - const doctype = this.consumeNextTokenOfType(Type.DOCTYPE); + const doctype = this.consumeNextTokenOfType(DOCTYPEToken); this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Punctuator, content: '` }); + this.emitNode({ position: { line: 0, character: 0 }, color: Palette.Comment, content: `` }); this.state = State.Undefined; break; @@ -177,24 +178,24 @@ export class Highlighter { return this.currentNode = node; } - private consumeNextTokenOfType(type: T): Token & { type: T } { + private consumeNextTokenOfType(type: Constructor): T { this.currentToken = this.tokens[this.pointer]; - VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`); + VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`); this.pointer++; - return this.currentToken as Token & { type: T }; + return this.currentToken; } - private consumeNextTokenOfEitherType(a: T, b: U): Token & { type: T | U } { + private 
consumeNextTokenOfEitherType(a: Constructor, b: Constructor): T | U { this.currentToken = this.tokens[this.pointer]; - VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`); + VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`); this.pointer++; - return this.currentToken as Token & { type: T }; + return this.currentToken; } private consumeNextTokenType(): Type { @@ -211,16 +212,16 @@ export class Highlighter { return this.currentToken; } - private currentTokenOfType(type: T): Token & { type: T } { - VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`); + private currentTokenOfType(type: Constructor): T { + VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`); - return this.currentToken as Token & { type: T }; + return this.currentToken; } - private currentTokenOfEitherType(a: T, b: U): Token & { type: T | U } { - VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`); + private currentTokenOfEitherType(a: Constructor, b: Constructor): T | U { + VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`); - return this.currentToken as Token & { type: T }; + return this.currentToken; } private reconsumeIn(state: State): void { diff --git a/html/tokenizer.ts b/html/tokenizer.ts index 98476fe..6709751 100644 --- a/html/tokenizer.ts +++ b/html/tokenizer.ts @@ -1,8 +1,9 @@ -import { TODO, VERIFY } from "../util/assertions.js"; +import { TODO, VERIFY, VERIFY_NOT_REACHED } from "../util/assertions.js"; +import { Constructor } from "../util/guards.js"; import { ParseError } 
from "./errors.js"; import { entities } from "./tokenizer/entities.js"; import { State } from "./tokenizer/state.js"; -import { AttributeList, Token, Type } from "./tokenizer/token.js"; +import { Attribute, CharacterToken, CommentToken, DOCTYPEToken, EndOfFileToken, EndTagToken, Position, StartTagToken, Token } from "./tokenizer/token.js"; export class Tokenizer { private state: State = State.Data; @@ -13,6 +14,8 @@ export class Tokenizer { private currentToken!: Token; private currentInputCharacter!: string; + private currentPosition: Position = { line: 0, column: 0, index: 0 }; + public tokens: Array = new Array(); private pointer: number = 0; @@ -30,10 +33,10 @@ export class Tokenizer { case '\u003C': this.state = State.TagOpen; break; case '\u0000': this.parseError('unexpected-null-character'); - this.emit({ type: Type.Character, data: this.currentInputCharacter }); + this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition)); break; - case undefined: this.emit({ type: Type.EndOfFile }); break; - default: this.emit({ type: Type.Character, data: this.currentInputCharacter }); + case undefined: this.emit(EndOfFileToken.create()); break; + default: this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition)); } break; @@ -41,9 +44,9 @@ export class Tokenizer { case State.RCDATA: { switch (this.consumeNext()) { case '\u003C': this.state = State.RAWTEXTLessThan; break; - case '\u0000': this.parseError('unexpected-null-character'); this.emit({ type: Type.Character, data: '\uFFFD' }); break; - case undefined: this.emit({ type: Type.EndOfFile }); break; - default: this.emit({ type: Type.Character, data: this.currentInputCharacter }); + case '\u0000': this.parseError('unexpected-null-character'); this.emit(CharacterToken.createReplacementCharacter().at(this.currentPosition)); break; + case undefined: this.emit(EndOfFileToken.create()); break; + default: 
this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition)); } break; @@ -54,23 +57,23 @@ export class Tokenizer { case '\u002F': this.state = State.EndTagOpen; break; case '\u003F': this.parseError('unexpected-question-mark-instead-of-tag-name'); - this.create({ type: Type.Comment, data: '' }); + this.create(CommentToken.createEmpty().startingAt(this.currentPosition)); this.reconsumeIn(State.BogusComment); break; case undefined: this.parseError('eof-before-tag-name'); - this.emit({ type: Type.Character, data: '\u003C' }); - this.emit({ type: Type.EndOfFile }); + this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition)); + this.emit(EndOfFileToken.create()); break; default: { if (this.asciiAlpha(this.currentInputCharacter)) { - this.create({ type: Type.StartTag, name: '', attributes: new AttributeList() }); + this.create(StartTagToken.createEmpty().startingAt(this.currentPosition)); this.reconsumeIn(State.TagName); break; } this.parseError('invalid-first-character-of-tag-name'); - this.emit({ type: Type.Character, data: '\u003C' }); + this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition)); this.reconsumeIn(State.Data); } } @@ -82,19 +85,19 @@ export class Tokenizer { case '\u003E': this.parseError('missing-end-tag-name'); this.state = State.Data; break; case undefined: this.parseError('eof-before-tag-name'); - this.emit({ type: Type.Character, data: '\u003C' }); - this.emit({ type: Type.Character, data: '\u002F' }); - this.emit({ type: Type.EndOfFile }); + this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition)); + this.emit(CharacterToken.createWith('\u002F').at(this.currentPosition)); + this.emit(EndOfFileToken.create()); break; default: { if (this.asciiAlpha(this.currentInputCharacter)) { - this.create({ type: Type.EndTag, name: '', attributes: new AttributeList() }); + this.create(EndTagToken.createEmpty().startingAt(this.currentPosition)); this.reconsumeIn(State.TagName); break; } 
this.parseError('invalid-first-character-of-tag-name'); - this.create({ type: Type.Comment, data: '' }); + this.create(CommentToken.createEmpty().startingAt(this.currentPosition)); this.reconsumeIn(State.BogusComment); } } @@ -104,7 +107,7 @@ export class Tokenizer { case State.MarkupDeclarationOpen: { if (this.matchNextFew('--')) { this.consumeNextFew('--'); - this.create({ type: Type.Comment, data: '' }); + this.create(CommentToken.createEmpty().startingAt(this.currentPosition)); this.state = State.CommentStart; } else if (this.matchNextFewCaseInsensitive('DOCTYPE')) { this.consumeNextFewCaseInsensitive('DOCTYPE'); @@ -114,11 +117,11 @@ export class Tokenizer { // NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the CDATA section state does not // exist and will not be started here. this.parseError('cdata-in-html-content'); - this.create({ type: Type.Comment, data: '[CDATA[' }); + this.create(CommentToken.createWith('[CDATA[').startingAt(this.currentPosition)); this.state = State.BogusComment; } else { this.parseError('incorrectly-opened-comment'); - this.create({ type: Type.Comment, data: '' }); + this.create(CommentToken.createEmpty().startingAt(this.currentPosition)); this.state = State.BogusComment; } @@ -133,8 +136,8 @@ export class Tokenizer { case '\u003E': this.reconsumeIn(State.BeforeDOCTYPEName); break; case undefined: this.parseError('eof-in-doctype'); - this.emit({ type: Type.DOCTYPE, forceQuirks: true }); - this.emit({ type: Type.EndOfFile }); + this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition)); + this.emit(EndOfFileToken.create()); break; default: this.parseError('missing-whitespace-before-doctype-name'); @@ -151,22 +154,22 @@ export class Tokenizer { case '\u0020': break; case '\u0000': this.parseError('unexpected-null-character'); - this.create({ type: Type.DOCTYPE, name: '\uFFFD' }); + this.create(DOCTYPEToken.createWithName('\uFFFD').startingAt(this.currentPosition)); this.state 
= State.DOCTYPEName; break; case undefined: this.parseError('eof-in-doctype'); - this.emit({ type: Type.DOCTYPE, forceQuirks: true }); - this.emit({ type: Type.EndOfFile }); + this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition)); + this.emit(EndOfFileToken.create()); break; default: { if (this.asciiUpperAlpha(this.currentInputCharacter)) { - this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter.toLowerCase()}); + this.create(DOCTYPEToken.createWithName(this.currentInputCharacter.toLowerCase()).startingAt(this.currentPosition)); this.state = State.DOCTYPEName; break; } - this.create({ type: Type.DOCTYPE, name: this.currentInputCharacter }); + this.create(DOCTYPEToken.createWithName(this.currentInputCharacter).startingAt(this.currentPosition)); this.state = State.DOCTYPE; } } @@ -179,21 +182,21 @@ export class Tokenizer { case '\u000A': case '\u000C': case '\u0020': this.state = State.AfterDOCTYPEName; break; - case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.DOCTYPE); break; - case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.DOCTYPE)!.name += '\uFFFD'; break; + case '\u003E': this.state = State.Data; this.emitCurrentOfType(DOCTYPEToken); break; + case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(DOCTYPEToken).appendReplacementCharacterToName(); break; case undefined: this.parseError('eof-in-doctype'); - this.currentOfType(Type.DOCTYPE).forceQuirks = true; - this.emitCurrentOfType(Type.DOCTYPE); - this.emit({ type: Type.EndOfFile }); + this.currentOfType(DOCTYPEToken).forceQuirks = true; + this.emitCurrentOfType(DOCTYPEToken); + this.emit(EndOfFileToken.create()); break; default: { if (this.asciiUpperAlpha(this.currentInputCharacter)) { - this.currentOfType(Type.DOCTYPE)!.name += this.currentInputCharacter.toLowerCase(); + this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter.toLowerCase()); break; } - 
this.currentOfType(Type.DOCTYPE)!.name += this.currentInputCharacter; + this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter); } } @@ -206,19 +209,19 @@ export class Tokenizer { case '\u000C': case '\u0020': this.state = State.BeforeAttributeName; break; case '\u002F': this.state = State.SelfClosingStartTag; break; - case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break; + case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break; case '\u0000': this.parseError('unexpected-null-character'); - this.currentOfEitherType(Type.StartTag, Type.EndTag).name += '\uFFFD'; + this.currentOfEitherType(StartTagToken, EndTagToken).appendReplacementCharacterToName(); break; - case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break; + case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break; default: { if (this.asciiUpperAlpha(this.currentInputCharacter)) { - this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter.toLowerCase(); + this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter.toLowerCase()); break; } - this.currentOfEitherType(Type.StartTag, Type.EndTag).name += this.currentInputCharacter; + this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter); } } @@ -235,12 +238,12 @@ export class Tokenizer { case undefined: this.reconsumeIn(State.AfterAttributeName); break; case '\u003D': { this.parseError('unexpected-equals-sign-before-attribute-name'); - this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: this.currentInputCharacter, value: '' }); + this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyValue(this.currentInputCharacter)); this.state = State.AttributeName; break; } default: { - 
this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' }); + this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue()); this.reconsumeIn(State.AttributeName); } } @@ -258,21 +261,21 @@ export class Tokenizer { case undefined: this.reconsumeIn(State.AfterAttributeName); break; case '\u003D': this.state = State.BeforeAttributeValue; break; case '\u0000': this.parseError('unexpected-null-character'); - this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += '\uFFFD'; + this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToName(); break; case '\u0022': case '\u0027': case '\u003C': this.parseError('unexpected-character-in-attribute-name'); - this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter; + this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter); break; default: { if (this.asciiUpperAlpha(this.currentInputCharacter)) { - this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter.toLowerCase(); + this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter.toLowerCase()); break; } - this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.name += this.currentInputCharacter; + this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter); } } @@ -286,10 +289,10 @@ export class Tokenizer { case '\u0020': break; case '\u002F': this.state = State.SelfClosingStartTag; break; case '\u003D': this.state = State.BeforeAttributeValue; break; - case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break; - case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break; + case 
'\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break; + case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break; default: - this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.append({ name: '', value: '' }); + this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue()); this.reconsumeIn(State.AttributeName); break; } @@ -307,7 +310,7 @@ export class Tokenizer { case '\u003E': this.parseError('missing-attribute-value'); this.state = State.Data; - this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); + this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break; default: this.reconsumeIn(State.AttributeValueUnquoted); @@ -321,10 +324,10 @@ export class Tokenizer { case '\u0026': this.returnState = State.AttributeValueDouble; this.state = State.CharacterReference; break; case '\u0000': this.parseError('unexpected-null-character'); - this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD'; + this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue(); break; - case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break; - default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter; + case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break; + default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter); } break; @@ -335,10 +338,10 @@ export class Tokenizer { case '\u0026': this.returnState = State.AttributeValueSingle; this.state = State.CharacterReference; break; case '\u0000': this.parseError('unexpected-null-character'); - this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD'; + this.currentOfEitherType(StartTagToken, 
EndTagToken).attributes.current.appendReplacementCharacterToValue(); break; - case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break; - default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter; + case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break; + default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter); } break; @@ -350,10 +353,10 @@ export class Tokenizer { case '\u000C': case '\u0020': this.state = State.BeforeAttributeName; break; case '\u0026': this.returnState = State.AttributeValueUnquoted; this.state = State.CharacterReference; break; - case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break; + case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break; case '\u0000': this.parseError('unexpected-null-character'); - this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += '\uFFFD'; + this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue(); break; case '\u0022': case '\u0027': @@ -361,10 +364,10 @@ export class Tokenizer { case '\u003D': case '\u0060': this.parseError('unexpected-character-in-unquoted-attribute-value'); - this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter; + this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter); break; - case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break; - default: this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter; + case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break; + default: 
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter); } break; @@ -376,8 +379,8 @@ export class Tokenizer { case '\u000C': case '\u0020': this.state = State.BeforeAttributeName; break; case '\u002F': this.state = State.SelfClosingStartTag; break; - case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(Type.StartTag, Type.EndTag); break; - case undefined: this.parseError('eof-in-tag'); this.emit({ type: Type.EndOfFile }); break; + case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break; + case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break; default: this.parseError('missing-whitespace-between-attributes'); this.reconsumeIn(State.BeforeAttributeName); } @@ -386,7 +389,7 @@ export class Tokenizer { case State.CommentStart: { switch (this.consumeNext()) { case '\u002D': this.state = State.CommentStartDash; break; - case '\u003E': this.parseError('abrupt-closing-of-empty-comment'); this.state = State.Data; this.emitCurrentOfType(Type.Comment); break; + case '\u003E': this.parseError('abrupt-closing-of-empty-comment'); this.state = State.Data; this.emitCurrentOfType(CommentToken); break; default: this.reconsumeIn(State.Comment); } @@ -395,11 +398,11 @@ export class Tokenizer { // FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state (adding **current** in some places) case State.Comment: { switch (this.consumeNext()) { - case '\u003C': this.currentOfType(Type.Comment).data += this.currentInputCharacter; this.state = State.CommentLessThanSign; break; + case '\u003C': this.currentOfType(CommentToken).append(this.currentInputCharacter); this.state = State.CommentLessThanSign; break; case '\u002D': this.state = State.CommentEndDash; break; - case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.Comment).data += '\uFFFD'; break; - case 
undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break; - default: this.currentOfType(Type.Comment).data += this.currentInputCharacter; + case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(CommentToken).appendReplacementCharacter(); break; + case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break; + default: this.currentOfType(CommentToken).append(this.currentInputCharacter); } break; @@ -407,8 +410,8 @@ export class Tokenizer { case State.CommentEndDash: { switch (this.consumeNext()) { case '\u002D': this.state = State.CommentEnd; break; - case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break; - default: this.currentOfType(Type.Comment).data += '\u002D'; this.reconsumeIn(State.Comment); + case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break; + default: this.currentOfType(CommentToken).append('\u002D'); this.reconsumeIn(State.Comment); } break; @@ -416,11 +419,11 @@ export class Tokenizer { // Same as above fixme https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state case State.CommentEnd: { switch (this.consumeNext()) { - case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.Comment); break; + case '\u003E': this.state = State.Data; this.emitCurrentOfType(CommentToken); break; case '\u0021': this.state = State.CommentEndBang; break; - case '\u002D': this.currentOfType(Type.Comment).data += '\u002D'; break; - case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break; - default: this.currentOfType(Type.Comment).data += '\u002D\u002D'; this.reconsumeIn(State.Comment); + case '\u002D': this.currentOfType(CommentToken).append('\u002D'); 
break; + case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break; + default: this.currentOfType(CommentToken).append('\u002D\u002D'); this.reconsumeIn(State.Comment); } break; @@ -428,10 +431,10 @@ export class Tokenizer { // Same as above https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state case State.BogusComment: { switch (this.consumeNext()) { - case '\u003E': this.state = State.Data; this.emitCurrentOfType(Type.Comment); break; - case undefined: this.emitCurrentOfType(Type.Comment); this.emit({ type: Type.EndOfFile }); break; - case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(Type.Comment).data += '\uFFFD'; break; - default: this.currentOfType(Type.Comment).data += this.currentInputCharacter; + case '\u003E': this.state = State.Data; this.emitCurrentOfType(CommentToken); break; + case undefined: this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break; + case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(CommentToken).appendReplacementCharacter(); break; + default: this.currentOfType(CommentToken).append(this.currentInputCharacter); } break; @@ -496,9 +499,9 @@ export class Tokenizer { default: { if (this.asciiAlphanumeric(this.currentInputCharacter)) { if (this.consumedAsPartOfAnAttribute()) { - this.currentOfEitherType(Type.StartTag, Type.EndTag).attributes.current.value += this.currentInputCharacter; + this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter); } else { - this.emit({ type: Type.Character, data: this.currentInputCharacter }); + this.emit(CharacterToken.createWith(this.currentInputCharacter)); } break; @@ -516,12 +519,12 @@ export class Tokenizer { private flushCodePointsConsumedAsCharacterReference(): void { if (this.consumedAsPartOfAnAttribute()) { - this.currentOfEitherType(Type.StartTag, 
Type.EndTag).attributes.current.value += this.temporaryBuffer; + this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.temporaryBuffer); return; } for (const codePoint of this.temporaryBuffer) - this.emit({ type: Type.Character, data: codePoint }); + this.emit(CharacterToken.createWith(codePoint)); } private consumedAsPartOfAnAttribute(): boolean { @@ -562,6 +565,14 @@ export class Tokenizer { this.currentInputCharacter = this.input[this.pointer]; this.pointer++; + this.currentPosition.column++; + this.currentPosition.index++; + + if (this.currentInputCharacter === '\n') { + this.currentPosition.column = 0; + this.currentPosition.line++; + } + return this.currentInputCharacter; } @@ -594,34 +605,53 @@ export class Tokenizer { } private emit(token: Token): void { + this.populateRangeOnEmit(token); this.tokens.push(token); } - private emitCurrentOfType(type: Type): void { - VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`); + private emitCurrentOfType(type: Constructor): void { + VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`); + this.populateRangeOnEmit(this.currentToken); this.tokens.push(this.currentToken); } - private emitCurrentOfEitherType(a: Type, b: Type): void { - VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`); + private emitCurrentOfEitherType(a: Constructor, b: Constructor): void { + VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`); + this.populateRangeOnEmit(this.currentToken); this.tokens.push(this.currentToken); } - private currentOfType(type: T): Token & { type: T } { - VERIFY(this.currentToken.type === type, `Expected '${type}', got '${this.currentToken.type}' instead`); + private 
currentOfType(type: Constructor): T { + VERIFY(this.currentToken instanceof type, `Expected '${type.name}', got '${this.currentToken.constructor.name}' instead`); - return this.currentToken as Token & { type: T }; + this.populateRangeOnEmit(this.currentToken); + return this.currentToken; } - private currentOfEitherType(a: T, b: U): Token & { type: T | U } { - VERIFY(this.currentToken.type === a || this.currentToken.type === b, `Expected '${a}' or '${b}', got '${this.currentToken.type}' instead`); + private currentOfEitherType(a: Constructor, b: Constructor): T | U { + VERIFY(this.currentToken instanceof a || this.currentToken instanceof b, `Expected '${a.name}' or '${b.name}', got '${this.currentToken.constructor.name}' instead`); - return this.currentToken as Token & { type: T }; + this.populateRangeOnEmit(this.currentToken); + return this.currentToken; + } + + private populateRangeOnEmit(token: Token): void { + if (token.range.start === undefined && token.range.end === undefined) + token.at(this.currentPosition); + + if (token.range.start !== undefined && token.range.end === undefined) + token.endingAt(this.currentPosition); + + if (token.range.start === undefined && token.range.end !== undefined) + VERIFY_NOT_REACHED(); } private create(token: Token): Token { + if (token.range.start === undefined) + token.startingAt(this.currentPosition); + return this.currentToken = token; } } diff --git a/html/tokenizer/token.ts b/html/tokenizer/token.ts index 1c9a789..ea1aced 100644 --- a/html/tokenizer/token.ts +++ b/html/tokenizer/token.ts @@ -1,3 +1,5 @@ +import { VERIFY, VERIFY_NOT_REACHED } from "../../util/assertions.js"; + export const enum Type { DOCTYPE = 'DOCTYPE', StartTag = 'start tag', @@ -7,7 +9,52 @@ export const enum Type { EndOfFile = 'end-of-file' } -export type Attribute = { name: NonNullable, value: NonNullable }; +export const REPLACEMENT_CHARACTER = '\uFFFD'; + +export type Range = { + start: Position, + end: Position +} + +export type Position = { + 
line: number, + column: number, + index: number +} + +export class Attribute { + public name: string; + public value: string; + + public constructor(name: string, value: string) { + this.name = name; + this.value = value; + } + + public appendToName(characters: string): void { + this.name += characters; + } + + public appendReplacementCharacterToName(): void { + this.appendToName(REPLACEMENT_CHARACTER); + } + + public appendToValue(characters: string): void { + this.value += characters; + } + + public appendReplacementCharacterToValue(): void { + this.appendToValue(REPLACEMENT_CHARACTER); + } + + public static createWithEmptyNameAndValue(): Attribute { + return new Attribute('', ''); + } + + public static createWithEmptyValue(name: string): Attribute { + return new Attribute(name, ''); + } +} export class AttributeList { private attributes: Array; @@ -33,29 +80,200 @@ export class AttributeList { } } -export type Token = { type: Type.DOCTYPE, name?: string, publicIdentifier?: string, systemIdentifier?: string, forceQuirks?: true } | - { type: Type.StartTag, name: NonNullable, selfClosing?: true, attributes: AttributeList } | - { type: Type.EndTag, name: NonNullable, selfClosing?: true, attributes: AttributeList } | - { type: Type.Comment, data: NonNullable } | - { type: Type.Character, data: NonNullable } | - { type: Type.EndOfFile }; +export abstract class Token { + #type: Type; + #range!: Range; + + protected constructor(type: Type) { + this.#type = type; + + // @ts-expect-error + this.#range = {}; + } + + public startingAt(position: Position): this { + this.#range.start = { line: position.line, column: position.column, index: position.index }; + + return this; + } + + public endingAt(position: Position): this { + this.#range.end = { line: position.line, column: position.column, index: position.index }; + + return this; + } + + public at(position: Position): this { + this.#range.start = { line: position.line, column: position.column, index: position.index }; + 
this.#range.end = { line: position.line, column: position.column, index: position.index }; + + return this; + } + + public get range(): Range { + return this.#range; + } + + public get type(): Type { + return this.#type; + } +} + +export class CharacterToken extends Token { + public readonly data: NonNullable; + + public constructor(data: NonNullable) { + super(Type.Character); + + this.data = data; + } + + public static createWith(data: NonNullable): CharacterToken { + return new CharacterToken(data); + } + + public static createReplacementCharacter(): CharacterToken { + return new CharacterToken(REPLACEMENT_CHARACTER); + } +} + +export class CommentToken extends Token { + public data: NonNullable; + + public constructor(data: NonNullable) { + super(Type.Comment); + + this.data = data; + } + + public append(characters: string): void { + this.data += characters; + } + + public appendReplacementCharacter(): void { + this.append(REPLACEMENT_CHARACTER); + } + + public static createEmpty(): CommentToken { + return new CommentToken(''); + } + + public static createWith(data: string): CommentToken { + return new CommentToken(data); + } +} + +export class EndOfFileToken extends Token { + public constructor() { + super(Type.EndOfFile); + } + + public static create(): EndOfFileToken { + return new EndOfFileToken(); + } +} + +export class StartTagToken extends Token { + public name: NonNullable; + public readonly attributes: AttributeList; + + public constructor(name: NonNullable, attributes: AttributeList) { + super(Type.StartTag); + + this.name = name; + this.attributes = attributes; + } + + public appendToName(characters: string): void { + this.name += characters; + } + + public appendReplacementCharacterToName(): void { + this.appendToName(REPLACEMENT_CHARACTER); + } + + public static createEmpty(): StartTagToken { + return new StartTagToken('', new AttributeList()); + } +} + +export class EndTagToken extends Token { + public name: NonNullable; + public readonly 
attributes: AttributeList; + + public constructor(name: NonNullable, attributes: AttributeList) { + super(Type.EndTag); + + this.name = name; + this.attributes = attributes; + } + + public appendToName(characters: string): void { + this.name += characters; + } + + public appendReplacementCharacterToName(): void { + this.appendToName(REPLACEMENT_CHARACTER); + } + + public static createEmpty(): EndTagToken { + return new EndTagToken('', new AttributeList()); + } +} + +export class DOCTYPEToken extends Token { + public name?: string; + public publicIdentifier?: string; + public systemIdentifier?: string; + public forceQuirks?: true; + + public constructor(name?: string, publicIdentifier?: string, systemIdentifier?: string, forceQuirks?: true) { + super(Type.DOCTYPE); + + this.name = name; + this.publicIdentifier = publicIdentifier; + this.systemIdentifier = systemIdentifier; + this.forceQuirks = forceQuirks; + } + + public appendToName(characters: string): void { + VERIFY(this.name !== undefined); + + this.name += characters; + } + + public appendReplacementCharacterToName(): void { + this.appendToName(REPLACEMENT_CHARACTER); + } + + public static createWithForcedQuirks(): DOCTYPEToken { + return new DOCTYPEToken(undefined, undefined, undefined, true); + } + + public static createWithName(name: string): DOCTYPEToken { + return new DOCTYPEToken(name, undefined, undefined, undefined); + } +} export function stringify(token: Token): string { - switch (token.type) { - case Type.Character: return token.data; - case Type.Comment: return ``; - case Type.DOCTYPE: return ``; - case Type.EndOfFile: return 'EOF'; - case Type.EndTag: return ``; - case Type.StartTag: { + if (token instanceof CharacterToken) return token.data; + if (token instanceof CommentToken) return ``; + if (token instanceof DOCTYPEToken) return ``; + if (token instanceof EndOfFileToken) return 'EOF'; + if (token instanceof EndTagToken) return ``; + if (token instanceof StartTagToken) { let string = 
`<${token.name}`; for (const attribute of token.attributes.list) string += ` ${attribute.name}="${attribute.value}"`; - if (token.selfClosing) return `${string} />`; + // TODO: Implement selfClosing + // if (token.selfClosing) return `${string} />`; return `${string}>`; - }; } + + VERIFY_NOT_REACHED(token.constructor.name); + + return ''; } \ No newline at end of file