diff --git a/src/html/errors.ts b/src/html/errors.ts
index f240d63..6289d26 100644
--- a/src/html/errors.ts
+++ b/src/html/errors.ts
@@ -17,4 +17,5 @@ export type ParseError = 'unexpected-null-character' |
'abrupt-closing-of-empty-comment' |
'eof-in-comment' |
'missing-semicolon-after-character-reference' |
- 'unknown-named-character-reference';
+ 'unknown-named-character-reference' |
+ 'missing-doctype-name';
diff --git a/src/html/tokenizer.ts b/src/html/tokenizer.ts
index 817ef86..efc30fe 100644
--- a/src/html/tokenizer.ts
+++ b/src/html/tokenizer.ts
@@ -32,524 +32,1168 @@ export class Tokenizer {
public spin(): void {
switch (this.state) {
+ // 13.2.5.1 Data state https://html.spec.whatwg.org/multipage/parsing.html#data-state
case State.Data: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0026 AMPERSAND (&)
case '\u0026':
+ // Set the return state to the data state.
this.returnState = State.Data;
+
+ // Switch to the character reference state.
this.state = State.CharacterReference;
break;
+ // U+003C LESS-THAN SIGN (<): Switch to the tag open state.
case '\u003C': this.state = State.TagOpen; break;
+ // U+0000 NULL
case '\u0000':
+ // This is an unexpected-null-character parse error.
this.parseError('unexpected-null-character');
+
+ // Emit the current input character as a character token.
this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
break;
+ // EOF: Emit an end-of-file token.
case undefined: this.emit(EndOfFileToken.create()); break;
+ // Anything else: Emit the current input character as a character token.
default: this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
}
break;
}
+ // 13.2.5.2 RCDATA state https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state
case State.RCDATA: {
+ // Consume the next input character:
switch (this.consumeNext()) {
- case '\u003C': this.state = State.RAWTEXTLessThan; break;
- case '\u0000': this.parseError('unexpected-null-character'); this.emit(CharacterToken.createReplacementCharacter().at(this.currentPosition)); break;
+ // U+0026 AMPERSAND (&)
+ case '\u0026':
+ // Set the return state to the RCDATA state.
+ this.returnState = State.RCDATA;
+
+ // Switch to the character reference state.
+ this.state = State.CharacterReference;
+ break;
+ // U+003C LESS-THAN SIGN (<): Switch to the RCDATA less-than sign state.
+ case '\u003C': this.state = State.RCDATALessThanSign; break;
+ // U+0000 NULL
+ case '\u0000':
+ // This is an unexpected-null-character parse error.
+ this.parseError('unexpected-null-character');
+
+ // Emit a U+FFFD REPLACEMENT CHARACTER character token.
+ this.emit(CharacterToken.createReplacementCharacter().at(this.currentPosition));
+ break;
+ // EOF: Emit an end-of-file token.
case undefined: this.emit(EndOfFileToken.create()); break;
+ // Anything else: Emit the current input character as a character token.
default: this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition));
}
break;
}
+ // 13.2.5.6 Tag open state https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state
case State.TagOpen: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0021 EXCLAMATION MARK (!): Switch to the markup declaration open state.
case '\u0021': this.state = State.MarkupDeclarationOpen; break;
+ // U+002F SOLIDUS (/): Switch to the end tag open state.
case '\u002F': this.state = State.EndTagOpen; break;
+ // U+003F QUESTION MARK (?)
case '\u003F':
+ // This is an unexpected-question-mark-instead-of-tag-name parse error.
this.parseError('unexpected-question-mark-instead-of-tag-name');
+
+ // Create a comment token whose data is the empty string.
this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
+
+ // Reconsume in the bogus comment state.
this.reconsumeIn(State.BogusComment);
break;
+ // EOF
case undefined:
+ // This is an eof-before-tag-name parse error.
this.parseError('eof-before-tag-name');
+
+ // Emit a U+003C LESS-THAN SIGN character token
this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
+
+ // and an end-of-file token.
this.emit(EndOfFileToken.create());
break;
default: {
+ // ASCII alpha
if (this.asciiAlpha(this.currentInputCharacter)) {
+ // Create a new start tag token, set its tag name to the empty string.
this.create(StartTagToken.createEmpty().startingAt(this.currentPosition));
+
+ // Reconsume in the tag name state.
this.reconsumeIn(State.TagName);
break;
}
+ // Anything else
+ // This is an invalid-first-character-of-tag-name parse error.
this.parseError('invalid-first-character-of-tag-name');
+
+ // Emit a U+003C LESS-THAN SIGN character token.
this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
+
+ // Reconsume in the data state.
this.reconsumeIn(State.Data);
}
}
break;
}
+ // 13.2.5.7 End tag open state https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state
case State.EndTagOpen: {
+ // Consume the next input character:
switch (this.consumeNext()) {
- case '\u003E': this.parseError('missing-end-tag-name'); this.state = State.Data; break;
+ // U+003E GREATER-THAN SIGN (>)
+ case '\u003E':
+ // This is a missing-end-tag-name parse error.
+ this.parseError('missing-end-tag-name');
+
+ // Switch to the data state.
+ this.state = State.Data;
+ break;
+ // EOF
case undefined:
+ // This is an eof-before-tag-name parse error.
this.parseError('eof-before-tag-name');
+
+ // Emit a U+003C LESS-THAN SIGN character token,
this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition));
+
+ // a U+002F SOLIDUS character token
this.emit(CharacterToken.createWith('\u002F').at(this.currentPosition));
+
+ // and an end-of-file token.
this.emit(EndOfFileToken.create());
break;
default: {
+ // ASCII alpha
if (this.asciiAlpha(this.currentInputCharacter)) {
+ // Create a new end tag token, set its tag name to the empty string.
this.create(EndTagToken.createEmpty().startingAt(this.currentPosition));
+
+ // Reconsume in the tag name state.
this.reconsumeIn(State.TagName);
break;
}
+ // Anything else
+ // This is an invalid-first-character-of-tag-name parse error.
this.parseError('invalid-first-character-of-tag-name');
+
+ // Create a comment token whose data is the empty string.
this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
+
+ // Reconsume in the bogus comment state.
this.reconsumeIn(State.BogusComment);
}
}
break;
}
+ // 13.2.5.42 Markup declaration open state https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state
case State.MarkupDeclarationOpen: {
+ // If the next few characters are:
+ // Two U+002D HYPHEN-MINUS characters (-)
if (this.matchNextFew('--')) {
+ // Consume those two characters,
this.consumeNextFew('--');
+
+ // create a comment token whose data is the empty string,
this.create(CommentToken.createEmpty().startingAt(this.currentPosition.copy().decrement(4)));
+
+ // and switch to the comment start state.
this.state = State.CommentStart;
+ // ASCII case-insensitive match for the word "DOCTYPE"
} else if (this.matchNextFewCaseInsensitive('DOCTYPE')) {
+ // Consume those characters
this.consumeNextFewCaseInsensitive('DOCTYPE');
+
+ // and switch to the DOCTYPE state.
this.state = State.DOCTYPE;
+ // The string "[CDATA[" (the five uppercase letters "CDATA" with a U+005B LEFT SQUARE BRACKET character before and after)
} else if (this.matchNextFew('[CDATA[')) {
+ // Consume those characters
this.consumeNextFew('[CDATA[');
// NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the CDATA section state does not
// exist and will not be started here.
+
+ // This is a cdata-in-html-content parse error.
this.parseError('cdata-in-html-content');
+
+ // Create a comment token whose data is the "[CDATA[" string.
this.create(CommentToken.createWith('[CDATA[').startingAt(this.currentPosition));
+
+ // Switch to the bogus comment state.
this.state = State.BogusComment;
+ // Anything else
} else {
+ // This is an incorrectly-opened-comment parse error.
this.parseError('incorrectly-opened-comment');
+
+ // Create a comment token whose data is the empty string.
this.create(CommentToken.createEmpty().startingAt(this.currentPosition));
+
+ // Switch to the bogus comment state (don't consume anything in the current state).
this.state = State.BogusComment;
}
break;
}
+ // 13.2.5.53 DOCTYPE state https://html.spec.whatwg.org/multipage/parsing.html#doctype-state
case State.DOCTYPE: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0009 CHARACTER TABULATION (tab)
+ // U+000A LINE FEED (LF)
+ // U+000C FORM FEED (FF)
+ // U+0020 SPACE
case '\u0009':
case '\u000A':
case '\u000C':
- case '\u0020': this.state = State.BeforeDOCTYPEName; break;
- case '\u003E': this.reconsumeIn(State.BeforeDOCTYPEName); break;
+ case '\u0020':
+ // Switch to the before DOCTYPE name state.
+ this.state = State.BeforeDOCTYPEName;
+ break;
+ // U+003E GREATER-THAN SIGN (>)
+ case '\u003E':
+ // Reconsume in the before DOCTYPE name state.
+ this.reconsumeIn(State.BeforeDOCTYPEName);
+ break;
+ // EOF
case undefined:
+ // This is an eof-in-doctype parse error.
this.parseError('eof-in-doctype');
+
+ // Create a new DOCTYPE token. Set its force-quirks flag to on. Emit the current token.
this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition));
+
+ // Emit an end-of-file token.
this.emit(EndOfFileToken.create());
break;
+ // Anything else
default:
+ // This is a missing-whitespace-before-doctype-name parse error.
this.parseError('missing-whitespace-before-doctype-name');
+
+ // Reconsume in the before DOCTYPE name state.
this.reconsumeIn(State.BeforeDOCTYPEName);
}
break;
}
+ // 13.2.5.54 Before DOCTYPE name state https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state
case State.BeforeDOCTYPEName: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0009 CHARACTER TABULATION (tab)
+ // U+000A LINE FEED (LF)
+ // U+000C FORM FEED (FF)
+ // U+0020 SPACE
case '\u0009':
case '\u000A':
case '\u000C':
- case '\u0020': break;
+ case '\u0020':
+ // Ignore the character.
+ break;
+ // U+0000 NULL
case '\u0000':
+ // This is an unexpected-null-character parse error.
this.parseError('unexpected-null-character');
- this.create(DOCTYPEToken.createWithName('\uFFFD').startingAt(this.currentPosition));
+
+ // Create a new DOCTYPE token. Set the token's name to a U+FFFD REPLACEMENT CHARACTER character.
+ this.create(DOCTYPEToken.createWithReplacementCharacter().startingAt(this.currentPosition));
+
+ // Switch to the DOCTYPE name state.
this.state = State.DOCTYPEName;
break;
+ // U+003E GREATER-THAN SIGN (>)
+ case '\u003E':
+ // This is a missing-doctype-name parse error.
+ this.parseError('missing-doctype-name');
+
+ // Create a new DOCTYPE token. Set its force-quirks flag to on.
+ this.create(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition));
+
+ // Switch to the data state.
+ this.state = State.Data;
+
+ // Emit the current token.
+ this.emitCurrentOfType(DOCTYPEToken);
+ break;
+ // EOF
case undefined:
+ // This is an eof-in-doctype parse error.
this.parseError('eof-in-doctype');
+
+ // Create a new DOCTYPE token. Set its force-quirks flag to on. Emit the current token.
this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition));
+
+ // Emit an end-of-file token.
this.emit(EndOfFileToken.create());
break;
default: {
+ // ASCII upper alpha
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
+ // Create a new DOCTYPE token. Set the token's name to the lowercase version of the current
+ // input character (add 0x0020 to the character's code point).
this.create(DOCTYPEToken.createWithName(this.currentInputCharacter.toLowerCase()).startingAt(this.currentPosition));
+
+ // Switch to the DOCTYPE name state.
this.state = State.DOCTYPEName;
break;
}
+ // Anything else
+ // Create a new DOCTYPE token. Set the token's name to the current input character.
this.create(DOCTYPEToken.createWithName(this.currentInputCharacter).startingAt(this.currentPosition));
+
+ // Switch to the DOCTYPE name state.
this.state = State.DOCTYPEName;
}
}
break;
}
+ // 13.2.5.55 DOCTYPE name state https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state
case State.DOCTYPEName: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0009 CHARACTER TABULATION (tab)
+ // U+000A LINE FEED (LF)
+ // U+000C FORM FEED (FF)
+ // U+0020 SPACE
case '\u0009':
case '\u000A':
case '\u000C':
- case '\u0020': this.state = State.AfterDOCTYPEName; break;
- case '\u003E': this.state = State.Data; this.emitCurrentOfType(DOCTYPEToken); break;
- case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(DOCTYPEToken).appendReplacementCharacterToName(); break;
- case undefined:
- this.parseError('eof-in-doctype');
- this.currentOfType(DOCTYPEToken).forceQuirks = true;
+ case '\u0020':
+ // Switch to the after DOCTYPE name state.
+ this.state = State.AfterDOCTYPEName;
+ break;
+ // U+003E GREATER-THAN SIGN (>)
+ case '\u003E':
+ // Switch to the data state.
+ this.state = State.Data;
+
+ // Emit the current DOCTYPE token.
this.emitCurrentOfType(DOCTYPEToken);
+ break;
+ // U+0000 NULL
+ case '\u0000':
+ // This is an unexpected-null-character parse error.
+ this.parseError('unexpected-null-character');
+
+ // Append a U+FFFD REPLACEMENT CHARACTER character to the current DOCTYPE token's name.
+ this.currentOfType(DOCTYPEToken).appendReplacementCharacterToName();
+ break;
+ // EOF
+ case undefined:
+ // This is an eof-in-doctype parse error.
+ this.parseError('eof-in-doctype');
+
+ // Set the current DOCTYPE token's force-quirks flag to on.
+ this.currentOfType(DOCTYPEToken).setForceQuirks();
+
+ // Emit the current DOCTYPE token.
+ this.emitCurrentOfType(DOCTYPEToken);
+
+ // Emit an end-of-file token.
this.emit(EndOfFileToken.create());
break;
default: {
+ // ASCII upper alpha
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
+ // Append the lowercase version of the current input character (add 0x0020 to the character's
+ // code point) to the current DOCTYPE token's name.
this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter.toLowerCase());
break;
}
+ // Anything else: Append the current input character to the current DOCTYPE token's name.
this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter);
}
}
break;
}
+ // 13.2.5.8 Tag name state https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state
case State.TagName: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0009 CHARACTER TABULATION (tab)
+ // U+000A LINE FEED (LF)
+ // U+000C FORM FEED (FF)
+ // U+0020 SPACE
case '\u0009':
case '\u000A':
case '\u000C':
- case '\u0020': this.state = State.BeforeAttributeName; break;
+ case '\u0020':
+ // Switch to the before attribute name state.
+ this.state = State.BeforeAttributeName;
+ break;
+ // U+002F SOLIDUS (/): Switch to the self-closing start tag state.
case '\u002F': this.state = State.SelfClosingStartTag; break;
- case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
+ // U+003E GREATER-THAN SIGN (>)
+ case '\u003E':
+ // Switch to the data state.
+ this.state = State.Data;
+
+ // Emit the current tag token.
+ this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
+ break;
+ // U+0000 NULL
case '\u0000':
+ // This is an unexpected-null-character parse error.
this.parseError('unexpected-null-character');
+
+ // Append a U+FFFD REPLACEMENT CHARACTER character to the current tag token's tag name.
this.currentOfEitherType(StartTagToken, EndTagToken).appendReplacementCharacterToName();
break;
- case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
+ // EOF
+ case undefined:
+ // This is an eof-in-tag parse error.
+ this.parseError('eof-in-tag');
+
+ // Emit an end-of-file token.
+ this.emit(EndOfFileToken.create()); break;
default: {
+ // ASCII upper alpha
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
+ // Append the lowercase version of the current input character (add 0x0020 to the character's
+ // code point) to the current tag token's tag name.
this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter.toLowerCase());
break;
}
+ // Anything else: Append the current input character to the current tag token's tag name.
this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter);
}
}
break;
}
+ // 13.2.5.32 Before attribute name state https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state
case State.BeforeAttributeName: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0009 CHARACTER TABULATION (tab)
+ // U+000A LINE FEED (LF)
+ // U+000C FORM FEED (FF)
+ // U+0020 SPACE
case '\u0009':
case '\u000A':
case '\u000C':
- case '\u0020': break;
+ case '\u0020':
+ // Ignore the character.
+ break;
+ // U+002F SOLIDUS (/)
+ // U+003E GREATER-THAN SIGN (>)
+ // EOF
case '\u002F':
case '\u003E':
- case undefined: this.reconsumeIn(State.AfterAttributeName); break;
+ case undefined:
+ // Reconsume in the after attribute name state.
+ this.reconsumeIn(State.AfterAttributeName);
+ break;
+ // U+003D EQUALS SIGN (=)
case '\u003D': {
+ // This is an unexpected-equals-sign-before-attribute-name parse error.
this.parseError('unexpected-equals-sign-before-attribute-name');
+
+ // Start a new attribute in the current tag token. Set that attribute's name to
+ // the current input character, and its value to the empty string.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyValue(this.currentInputCharacter).startingNameAt(this.currentPosition));
+
+ // Switch to the attribute name state.
this.state = State.AttributeName;
break;
}
+ // Anything else
default: {
+ // Start a new attribute in the current tag token. Set that attribute name and value to the empty string.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue().startingNameAt(this.currentPosition));
+
+ // Reconsume in the attribute name state.
this.reconsumeIn(State.AttributeName);
}
}
break;
}
+ // 13.2.5.33 Attribute name state https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state
case State.AttributeName: {
+ // Consume the next input character:
switch (this.consumeNext()) {
- case '\u0009':
+ // U+000A LINE FEED (LF)
+ // U+000C FORM FEED (FF)
+ // U+0020 SPACE
+ // U+002F SOLIDUS (/)
+ // U+003E GREATER-THAN SIGN (>)
+ // EOF
+ // U+0009 CHARACTER TABULATION (tab)
case '\u000A':
case '\u000C':
case '\u0020':
case '\u002F':
case '\u003E':
- case undefined: this.reconsumeIn(State.AfterAttributeName); break;
+ case undefined:
+ case '\u0009':
+ // Reconsume in the after attribute name state.
+ this.reconsumeIn(State.AfterAttributeName);
+ break;
+ // U+003D EQUALS SIGN (=)
case '\u003D':
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingNameAt(this.currentPosition.copy().decrement(1));
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.equalsAt(this.currentPosition);
+
+ // Switch to the before attribute value state.
this.state = State.BeforeAttributeValue;
break;
- case '\u0000': this.parseError('unexpected-null-character');
+ // U+0000 NULL
+ case '\u0000':
+ // This is an unexpected-null-character parse error.
+ this.parseError('unexpected-null-character');
+
+ // Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's name.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToName();
break;
+ // U+0022 QUOTATION MARK (")
+ // U+0027 APOSTROPHE (')
+ // U+003C LESS-THAN SIGN (<)
case '\u0022':
case '\u0027':
case '\u003C':
+ // This is an unexpected-character-in-attribute-name parse error.
this.parseError('unexpected-character-in-attribute-name');
+
+ // Treat it as per the "anything else" entry below.
+ // Append the current input character to the current attribute's name.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter);
break;
default: {
+ // ASCII upper alpha
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
+ // Append the lowercase version of the current input character (add 0x0020 to the character's
+ // code point) to the current attribute's name.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter.toLowerCase());
break;
}
+ // Anything else: Append the current input character to the current attribute's name.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter);
}
}
break;
}
+ // 13.2.5.34 After attribute name state https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state
case State.AfterAttributeName: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0009 CHARACTER TABULATION (tab)
+ // U+000A LINE FEED (LF)
+ // U+000C FORM FEED (FF)
+ // U+0020 SPACE
case '\u0009':
case '\u000A':
case '\u000C':
- case '\u0020': break;
+ case '\u0020':
+ // Ignore the character.
+ break;
+ // U+002F SOLIDUS (/)
case '\u002F':
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingNameAt(this.currentPosition);
+
+ // Switch to the self-closing start tag state.
this.state = State.SelfClosingStartTag;
break;
+ // U+003D EQUALS SIGN (=)
case '\u003D':
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingNameAt(this.currentPosition);
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.equalsAt(this.currentPosition);
+
+ // Switch to the before attribute value state.
this.state = State.BeforeAttributeValue;
break;
+ // U+003E GREATER-THAN SIGN (>)
case '\u003E':
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingNameAt(this.currentPosition);
+
+ // Switch to the data state.
this.state = State.Data;
+
+ // Emit the current tag token.
this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
break;
- case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
+ // EOF
+ case undefined:
+ // This is an eof-in-tag parse error.
+ this.parseError('eof-in-tag');
+
+ // Emit an end-of-file token.
+ this.emit(EndOfFileToken.create());
+ break;
default:
+ // Anything else
+ // Start a new attribute in the current tag token. Set that attribute name and value to the empty string.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue().startingNameAt(this.currentPosition));
+
+ // Reconsume in the attribute name state.
this.reconsumeIn(State.AttributeName);
break;
}
break;
}
+ // 13.2.5.35 Before attribute value state https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state
case State.BeforeAttributeValue: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0009 CHARACTER TABULATION (tab)
+ // U+000A LINE FEED (LF)
+ // U+000C FORM FEED (FF)
+ // U+0020 SPACE
case '\u0009':
case '\u000A':
case '\u000C':
- case '\u0020': break;
+ case '\u0020':
+ // Ignore the character.
+ break;
+ // U+0022 QUOTATION MARK (")
case '\u0022':
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.startingValueAt(this.currentPosition);
+
+ // Switch to the attribute value (double-quoted) state.
this.state = State.AttributeValueDouble;
break;
+ // U+0027 APOSTROPHE (')
case '\u0027':
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.startingValueAt(this.currentPosition);
+
+ // Switch to the attribute value (single-quoted) state.
this.state = State.AttributeValueSingle;
break;
+ // U+003E GREATER-THAN SIGN (>)
case '\u003E':
- this.parseError('missing-attribute-value');
- this.state = State.Data;
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingNameAt(this.currentPosition);
+
+ // This is a missing-attribute-value parse error.
+ this.parseError('missing-attribute-value');
+
+ // Switch to the data state.
+ this.state = State.Data;
+
+ // Emit the current tag token.
this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
break;
default:
+ // Anything else
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.startingValueAt(this.currentPosition);
+
+ // Reconsume in the attribute value (unquoted) state.
this.reconsumeIn(State.AttributeValueUnquoted);
}
break;
}
+ // 13.2.5.36 Attribute value (double-quoted) state https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state
case State.AttributeValueDouble: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0022 QUOTATION MARK (")
case '\u0022':
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingValueAt(this.currentPosition);
+
+ // Switch to the after attribute value (quoted) state.
this.state = State.AfterAttributeValue;
break;
- case '\u0026': this.returnState = State.AttributeValueDouble; this.state = State.CharacterReference; break;
+ // U+0026 AMPERSAND (&)
+ case '\u0026':
+ // Set the return state to the attribute value (double-quoted) state.
+ this.returnState = State.AttributeValueDouble;
+
+ // Switch to the character reference state.
+ this.state = State.CharacterReference;
+ break;
+ // U+0000 NULL
case '\u0000':
+ // This is an unexpected-null-character parse error.
this.parseError('unexpected-null-character');
+
+ // Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
break;
- case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
+ // EOF
+ case undefined:
+ // This is an eof-in-tag parse error.
+ this.parseError('eof-in-tag');
+
+ // Emit an end-of-file token.
+ this.emit(EndOfFileToken.create());
+ break;
+ // Anything else: Append the current input character to the current attribute's value.
default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
}
break;
}
+ // 13.2.5.37 Attribute value (single-quoted) state https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state
case State.AttributeValueSingle: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0027 APOSTROPHE (')
case '\u0027':
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingValueAt(this.currentPosition);
+
+ // Switch to the after attribute value (quoted) state.
this.state = State.AfterAttributeValue;
break;
- case '\u0026': this.returnState = State.AttributeValueSingle; this.state = State.CharacterReference; break;
+ // U+0026 AMPERSAND (&)
+ case '\u0026':
+ // Set the return state to the attribute value (single-quoted) state.
+ this.returnState = State.AttributeValueSingle;
+
+ // Switch to the character reference state.
+ this.state = State.CharacterReference;
+ break;
+ // U+0000 NULL
case '\u0000':
+ // This is an unexpected-null-character parse error.
this.parseError('unexpected-null-character');
+
+ // Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
break;
- case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
+ // EOF
+ case undefined:
+ // This is an eof-in-tag parse error.
+ this.parseError('eof-in-tag');
+
+ // Emit an end-of-file token.
+ this.emit(EndOfFileToken.create()); break;
+ // Anything else: Append the current input character to the current attribute's value.
default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
}
break;
}
+ // 13.2.5.38 Attribute value (unquoted) state https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state
case State.AttributeValueUnquoted: {
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.setUnquoted();
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0009 CHARACTER TABULATION (tab)
+ // U+000A LINE FEED (LF)
+ // U+000C FORM FEED (FF)
+ // U+0020 SPACE
case '\u0009':
case '\u000A':
case '\u000C':
- case '\u0020': this.state = State.BeforeAttributeName; break;
- case '\u0026': this.returnState = State.AttributeValueUnquoted; this.state = State.CharacterReference; break;
+ case '\u0020':
+ // Switch to the before attribute name state.
+ this.state = State.BeforeAttributeName;
+ break;
+ // U+0026 AMPERSAND (&)
+ case '\u0026':
+ // Set the return state to the attribute value (unquoted) state.
+ this.returnState = State.AttributeValueUnquoted;
+
+ // Switch to the character reference state.
+ this.state = State.CharacterReference;
+ break;
+ // U+003E GREATER-THAN SIGN (>)
case '\u003E':
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingValueAt(this.currentPosition);
+
+ // Switch to the data state.
this.state = State.Data;
+
+ // Emit the current tag token.
this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
break;
+ // U+0000 NULL
case '\u0000':
+ // This is an unexpected-null-character parse error.
this.parseError('unexpected-null-character');
+
+ // Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue();
break;
+ // U+0022 QUOTATION MARK (")
+ // U+0027 APOSTROPHE (')
+ // U+003C LESS-THAN SIGN (<)
+ // U+003D EQUALS SIGN (=)
+ // U+0060 GRAVE ACCENT (`)
case '\u0022':
case '\u0027':
case '\u003C':
case '\u003D':
case '\u0060':
+ // This is an unexpected-character-in-unquoted-attribute-value parse error.
this.parseError('unexpected-character-in-unquoted-attribute-value');
+
+ // Treat it as per the "anything else" entry below.
+ // Append the current input character to the current attribute's value.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
break;
- case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
+ // EOF
+ case undefined:
+ // This is an eof-in-tag parse error.
+ this.parseError('eof-in-tag');
+
+ // Emit an end-of-file token.
+ this.emit(EndOfFileToken.create()); break;
+ // Anything else: Append the current input character to the current attribute's value.
default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
}
break;
}
+ // 13.2.5.39 After attribute value (quoted) state https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state
case State.AfterAttributeValue: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+0009 CHARACTER TABULATION (tab)
+ // U+000A LINE FEED (LF)
+ // U+000C FORM FEED (FF)
+ // U+0020 SPACE
case '\u0009':
case '\u000A':
case '\u000C':
- case '\u0020': this.state = State.BeforeAttributeName; break;
+ case '\u0020':
+ // Switch to the before attribute name state.
+ this.state = State.BeforeAttributeName;
+ break;
+ // U+002F SOLIDUS (/): Switch to the self-closing start tag state.
case '\u002F': this.state = State.SelfClosingStartTag; break;
- case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break;
- case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break;
- default: this.parseError('missing-whitespace-between-attributes'); this.reconsumeIn(State.BeforeAttributeName);
+ // U+003E GREATER-THAN SIGN (>)
+ case '\u003E':
+ // Switch to the data state.
+ this.state = State.Data;
+
+ // Emit the current tag token.
+ this.emitCurrentOfEitherType(StartTagToken, EndTagToken);
+ break;
+ // EOF
+ case undefined:
+ // This is an eof-in-tag parse error.
+ this.parseError('eof-in-tag');
+
+ // Emit an end-of-file token.
+ this.emit(EndOfFileToken.create());
+ break;
+ // Anything else
+ default:
+ // This is a missing-whitespace-between-attributes parse error.
+ this.parseError('missing-whitespace-between-attributes');
+
+ // Reconsume in the before attribute name state.
+ this.reconsumeIn(State.BeforeAttributeName);
}
break;
}
+ // 13.2.5.43 Comment start state https://html.spec.whatwg.org/multipage/parsing.html#comment-start-state
case State.CommentStart: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+002D HYPHEN-MINUS (-): Switch to the comment start dash state.
case '\u002D': this.state = State.CommentStartDash; break;
- case '\u003E': this.parseError('abrupt-closing-of-empty-comment'); this.state = State.Data; this.emitCurrentOfType(CommentToken); break;
+ // U+003E GREATER-THAN SIGN (>)
+ case '\u003E':
+ // This is an abrupt-closing-of-empty-comment parse error.
+ this.parseError('abrupt-closing-of-empty-comment');
+
+ // Switch to the data state.
+ this.state = State.Data;
+
+ // Emit the current comment token.
+ this.emitCurrentOfType(CommentToken);
+ break;
+ // Anything else: Reconsume in the comment state.
default: this.reconsumeIn(State.Comment);
}
break;
}
// FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state (adding **current** in some places)
+ // 13.2.5.45 Comment state https://html.spec.whatwg.org/multipage/parsing.html#comment-state
case State.Comment: {
+ // Consume the next input character:
switch (this.consumeNext()) {
- case '\u003C': this.currentOfType(CommentToken).append(this.currentInputCharacter); this.state = State.CommentLessThanSign; break;
+ // U+003C LESS-THAN SIGN (<)
+ case '\u003C':
+ // Append the current input character to the comment token's data.
+ this.currentOfType(CommentToken).append(this.currentInputCharacter);
+
+ // Switch to the comment less-than sign state.
+ this.state = State.CommentLessThanSign;
+ break;
+ // U+002D HYPHEN-MINUS (-): Switch to the comment end dash state.
case '\u002D': this.state = State.CommentEndDash; break;
- case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(CommentToken).appendReplacementCharacter(); break;
- case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
+ // U+0000 NULL
+ case '\u0000':
+ // This is an unexpected-null-character parse error.
+ this.parseError('unexpected-null-character');
+
+ // Append a U+FFFD REPLACEMENT CHARACTER character to the comment token's data.
+ this.currentOfType(CommentToken).appendReplacementCharacter();
+ break;
+ // EOF
+ case undefined:
+ // This is an eof-in-comment parse error.
+ this.parseError('eof-in-comment');
+
+ // Emit the current comment token.
+ this.emitCurrentOfType(CommentToken);
+
+ // Emit an end-of-file token.
+ this.emit(EndOfFileToken.create());
+ break;
+ // Anything else: Append the current input character to the comment token's data.
default: this.currentOfType(CommentToken).append(this.currentInputCharacter);
}
break;
}
+ // 13.2.5.50 Comment end dash state https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state
case State.CommentEndDash: {
+ // Consume the next input character:
switch (this.consumeNext()) {
+ // U+002D HYPHEN-MINUS (-): Switch to the comment end state.
case '\u002D': this.state = State.CommentEnd; break;
- case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
- default: this.currentOfType(CommentToken).append('\u002D'); this.reconsumeIn(State.Comment);
+ // EOF
+ case undefined:
+ // This is an eof-in-comment parse error.
+ this.parseError('eof-in-comment');
+
+ // Emit the current comment token.
+ this.emitCurrentOfType(CommentToken);
+
+ // Emit an end-of-file token.
+ this.emit(EndOfFileToken.create());
+ break;
+ // Anything else
+ default:
+ // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
+ this.currentOfType(CommentToken).append('\u002D');
+
+ // Reconsume in the comment state.
+ this.reconsumeIn(State.Comment);
}
break;
}
// Same as above fixme https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
+ // 13.2.5.51 Comment end state https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state
case State.CommentEnd: {
+ // Consume the next input character:
switch (this.consumeNext()) {
- case '\u003E': this.state = State.Data; this.emit(this.currentOfType(CommentToken).endingAt(this.currentPosition.copy().increment(1))); break;
+ // U+003E GREATER-THAN SIGN (>)
+ case '\u003E':
+ // Switch to the data state.
+ this.state = State.Data;
+
+ // Emit the current comment token.
+ this.emit(this.currentOfType(CommentToken).endingAt(this.currentPosition.copy().increment(1)));
+ break;
+ // U+0021 EXCLAMATION MARK (!): Switch to the comment end bang state.
case '\u0021': this.state = State.CommentEndBang; break;
+ // U+002D HYPHEN-MINUS (-): Append a U+002D HYPHEN-MINUS character (-) to the comment token's data.
case '\u002D': this.currentOfType(CommentToken).append('\u002D'); break;
- case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
- default: this.currentOfType(CommentToken).append('\u002D\u002D'); this.reconsumeIn(State.Comment);
+ // EOF
+ case undefined:
+ // This is an eof-in-comment parse error.
+ this.parseError('eof-in-comment');
+
+ // Emit the current comment token.
+ this.emitCurrentOfType(CommentToken);
+
+ // Emit an end-of-file token.
+ this.emit(EndOfFileToken.create());
+ break;
+ // Anything else
+ default:
+ // Append two U+002D HYPHEN-MINUS characters (-) to the comment token's data.
+ this.currentOfType(CommentToken).append('\u002D\u002D');
+
+ // Reconsume in the comment state.
+ this.reconsumeIn(State.Comment);
}
break;
}
// Same as above https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
+ // 13.2.5.41 Bogus comment state https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state
case State.BogusComment: {
+ // Consume the next input character:
switch (this.consumeNext()) {
- case '\u003E': this.state = State.Data; this.emitCurrentOfType(CommentToken); break;
- case undefined: this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break;
- case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(CommentToken).appendReplacementCharacter(); break;
+ // U+003E GREATER-THAN SIGN (>)
+ case '\u003E':
+ // Switch to the data state.
+ this.state = State.Data;
+
+ // Emit the current comment token.
+ this.emitCurrentOfType(CommentToken);
+ break;
+ // EOF
+ case undefined:
+ // Emit the comment.
+ this.emitCurrentOfType(CommentToken);
+
+ // Emit an end-of-file token.
+ this.emit(EndOfFileToken.create());
+ break;
+ // U+0000 NULL
+ case '\u0000':
+ // This is an unexpected-null-character parse error.
+ this.parseError('unexpected-null-character');
+
+ // Append a U+FFFD REPLACEMENT CHARACTER character to the comment token's data.
+ this.currentOfType(CommentToken).appendReplacementCharacter();
+ break;
+ // Anything else: Append the current input character to the comment token's data.
default: this.currentOfType(CommentToken).append(this.currentInputCharacter);
}
break;
}
+ // 13.2.5.72 Character reference state https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state
case State.CharacterReference: {
+ // Set the temporary buffer to the empty string.
this.temporaryBuffer = '';
+
+ // Append a U+0026 AMPERSAND (&) character to the temporary buffer.
this.temporaryBuffer += '\u0026';
+ // Consume the next input character:
switch (this.consumeNext()) {
- case '\u0023': this.temporaryBuffer += this.currentInputCharacter; this.state = State.NumericCharacterReference; break;
+ // U+0023 NUMBER SIGN (#)
+ case '\u0023':
+ // Append the current input character to the temporary buffer.
+ this.temporaryBuffer += this.currentInputCharacter;
+
+ // Switch to the numeric character reference state.
+ this.state = State.NumericCharacterReference;
+ break;
default: {
+ // ASCII alphanumeric
if (this.asciiAlphanumeric(this.currentInputCharacter)) {
+ // Reconsume in the named character reference state.
this.reconsumeIn(State.NamedCharacterReference);
break;
}
+ // Anything else
+ // Flush code points consumed as a character reference.
this.flushCodePointsConsumedAsCharacterReference();
+
+ // Reconsume in the return state.
this.reconsumeIn(this.returnState);
}
}
break;
}
+ // 13.2.5.73 Named character reference state https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state
case State.NamedCharacterReference: {
let match = false;
+ // Consume the maximum number of characters possible, where the consumed characters are one of the
+ // identifiers in the first column of the named character references table. Append each character to the
+ // temporary buffer when it's consumed.
+
+ // NOTE: this relies on the keys of `entities` being ordered longest-first, so that the first match found below is also the longest possible one.
for (const entry in entities) {
if (this.matchNextFew(entry)) {
+ // If there is a match
match = true;
this.consumeNextFew(entry);
this.temporaryBuffer += entry;
+ // If the character reference was consumed as part of an attribute, and the last character matched
+ // is not a U+003B SEMICOLON character (;), and the next input character is either a U+003D
+ // EQUALS SIGN character (=) or an ASCII alphanumeric, then, for historical reasons,
if (this.consumedAsPartOfAnAttribute() && entry[entry.length - 1] !== '\u003B' && (this.next() === '\u003D' || this.asciiAlphanumeric(this.next() ?? ''))) {
+ // flush code points consumed as a character reference and
this.flushCodePointsConsumedAsCharacterReference();
+
+ // switch to the return state.
this.state = this.returnState;
break;
}
+ // Otherwise:
+ // 1. If the last character matched is not a U+003B SEMICOLON character (;), then this is a
+ // missing-semicolon-after-character-reference parse error.
if (entry[entry.length - 1] !== '\u003B')
this.parseError('missing-semicolon-after-character-reference');
+ // 2. Set the temporary buffer to the empty string.
this.temporaryBuffer = '';
+
+ // Append one or two characters corresponding to the character reference name (as given by the second
+ // column of the named character references table) to the temporary buffer.
this.temporaryBuffer += entities[entry].characters;
+
+ // 3. Flush code points consumed as a character reference.
this.flushCodePointsConsumedAsCharacterReference();
+
+ // Switch to the return state.
this.state = this.returnState;
break;
}
}
+ // Otherwise
if (!match) {
+ // Flush code points consumed as a character reference.
this.flushCodePointsConsumedAsCharacterReference();
+
+ // Switch to the ambiguous ampersand state.
this.state = State.AmbiguousAmpersand;
}
break;
}
+ // 13.2.5.74 Ambiguous ampersand state https://html.spec.whatwg.org/multipage/parsing.html#ambiguous-ampersand-state
case State.AmbiguousAmpersand: {
+ // Consume the next input character:
switch (this.consumeNext()) {
- case '\u003B': this.parseError('unknown-named-character-reference'); this.reconsumeIn(this.returnState); break;
+ // U+003B SEMICOLON (;)
+ case '\u003B':
+ // This is an unknown-named-character-reference parse error.
+ this.parseError('unknown-named-character-reference');
+
+ // Reconsume in the return state.
+ this.reconsumeIn(this.returnState); break;
default: {
+ // ASCII alphanumeric
if (this.asciiAlphanumeric(this.currentInputCharacter)) {
+ // If the character reference was consumed as part of an attribute,
if (this.consumedAsPartOfAnAttribute()) {
+ // then append the current input character to the current attribute's value.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter);
+ // Otherwise,
} else {
+ // emit the current input character as a character token.
this.emit(CharacterToken.createWith(this.currentInputCharacter));
}
break;
}
+ // Anything else: Reconsume in the return state.
this.reconsumeIn(this.returnState);
}
}
@@ -560,12 +1204,15 @@ export class Tokenizer {
}
}
+ // Flush code points consumed as a character reference: hand the temporary buffer either to the current attribute's value or to the output as character tokens. https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference
private flushCodePointsConsumedAsCharacterReference(): void {
+ // If the character reference was consumed as part of an attribute, append the whole temporary buffer to the current attribute's value in one call and stop.
if (this.consumedAsPartOfAnAttribute()) {
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.temporaryBuffer);
return;
}
+ // Otherwise emit one character token per code point (for...of walks a string by code point, not by UTF-16 unit). NOTE(review): no `.at(position)` is attached here, unlike the data state - confirm that is intended.
for (const codePoint of this.temporaryBuffer)
this.emit(CharacterToken.createWith(codePoint));
}
diff --git a/src/html/tokenizer/tokens/doctype.ts b/src/html/tokenizer/tokens/doctype.ts
index a041ae7..8902779 100644
--- a/src/html/tokenizer/tokens/doctype.ts
+++ b/src/html/tokenizer/tokens/doctype.ts
@@ -26,6 +26,10 @@ export class DOCTYPEToken extends Token {
this.appendToName(REPLACEMENT_CHARACTER);
}
+ public setForceQuirks(): void { // Turns this token's force-quirks flag on.
+ this.forceQuirks = true;
+ }
+
public static createWithForcedQuirks(): DOCTYPEToken {
return new DOCTYPEToken(undefined, undefined, undefined, true);
}
@@ -34,6 +38,10 @@ export class DOCTYPEToken extends Token {
return new DOCTYPEToken(name, undefined, undefined, undefined);
}
+ public static createWithReplacementCharacter(): DOCTYPEToken { // Creates a DOCTYPE token with REPLACEMENT_CHARACTER as its name; every other constructor argument is left undefined.
+ return new DOCTYPEToken(REPLACEMENT_CHARACTER, undefined, undefined, undefined);
+ }
+
public override inspect(indent: number): string {
return `DOCTYPEToken { '${this.name}' }`;
}