diff --git a/src/html/errors.ts b/src/html/errors.ts index f240d63..6289d26 100644 --- a/src/html/errors.ts +++ b/src/html/errors.ts @@ -17,4 +17,5 @@ export type ParseError = 'unexpected-null-character' | 'abrupt-closing-of-empty-comment' | 'eof-in-comment' | 'missing-semicolon-after-character-reference' | - 'unknown-named-character-reference'; + 'unknown-named-character-reference' | + 'missing-doctype-name'; diff --git a/src/html/tokenizer.ts b/src/html/tokenizer.ts index 817ef86..efc30fe 100644 --- a/src/html/tokenizer.ts +++ b/src/html/tokenizer.ts @@ -32,524 +32,1168 @@ export class Tokenizer { public spin(): void { switch (this.state) { + // 13.2.5.1 Data state https://html.spec.whatwg.org/multipage/parsing.html#data-state case State.Data: { + // Consume the next input character: switch (this.consumeNext()) { + // U+0026 AMPERSAND (&) case '\u0026': + // Set the return state to the data state. this.returnState = State.Data; + + // Switch to the character reference state. this.state = State.CharacterReference; break; + // U+003C LESS-THAN SIGN (<): Switch to the tag open state. case '\u003C': this.state = State.TagOpen; break; + // U+0000 NULL case '\u0000': + // This is an unexpected-null-character parse error. this.parseError('unexpected-null-character'); + + // Emit the current input character as a character token. this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition)); break; + // EOF: Emit an end-of-file token. case undefined: this.emit(EndOfFileToken.create()); break; + // Anything else: Emit the current input character as a character token. default: this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition)); } break; } + // 13.2.5.2 RCDATA state https://html.spec.whatwg.org/multipage/parsing.html#rcdata-state case State.RCDATA: { + // Consume the next input character: switch (this.consumeNext()) { - case '\u003C': this.state = State.RAWTEXTLessThan; break; - case '\u0000': this.parseError('unexpected-null-character'); this.emit(CharacterToken.createReplacementCharacter().at(this.currentPosition)); break; + // U+0026 AMPERSAND (&) + case '\u0026': + // Set the return state to the RCDATA state. + this.returnState = State.RCDATA; + + // Switch to the character reference state. + this.state = State.CharacterReference; + break; + // U+003C LESS-THAN SIGN (<): Switch to the RCDATA less-than sign state. + case '\u003C': this.state = State.RCDATALessThanSign; break; + // U+0000 NULL + case '\u0000': + // This is an unexpected-null-character parse error. + this.parseError('unexpected-null-character'); + + // Emit a U+FFFD REPLACEMENT CHARACTER character token. + this.emit(CharacterToken.createReplacementCharacter().at(this.currentPosition)); + break; + // EOF: Emit an end-of-file token. case undefined: this.emit(EndOfFileToken.create()); break; + // Anything else: Emit the current input character as a character token. default: this.emit(CharacterToken.createWith(this.currentInputCharacter).at(this.currentPosition)); } break; } + // 13.2.5.6 Tag open state https://html.spec.whatwg.org/multipage/parsing.html#tag-open-state case State.TagOpen: { + // Consume the next input character: switch (this.consumeNext()) { + // U+0021 EXCLAMATION MARK (!): Switch to the markup declaration open state. case '\u0021': this.state = State.MarkupDeclarationOpen; break; + // U+002F SOLIDUS (/): Switch to the end tag open state. case '\u002F': this.state = State.EndTagOpen; break; + // U+003F QUESTION MARK (?) case '\u003F': + // This is an unexpected-question-mark-instead-of-tag-name parse error. this.parseError('unexpected-question-mark-instead-of-tag-name'); + + // Create a comment token whose data is the empty string. this.create(CommentToken.createEmpty().startingAt(this.currentPosition)); + + // Reconsume in the bogus comment state. this.reconsumeIn(State.BogusComment); break; + // EOF case undefined: + // This is an eof-before-tag-name parse error. this.parseError('eof-before-tag-name'); + + // Emit a U+003C LESS-THAN SIGN character token this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition)); + + // and an end-of-file token. this.emit(EndOfFileToken.create()); break; default: { + // ASCII alpha if (this.asciiAlpha(this.currentInputCharacter)) { + // Create a new start tag token, set its tag name to the empty string. this.create(StartTagToken.createEmpty().startingAt(this.currentPosition)); + + // Reconsume in the tag name state. this.reconsumeIn(State.TagName); break; } + // Anything else + // This is an invalid-first-character-of-tag-name parse error. this.parseError('invalid-first-character-of-tag-name'); + + // Emit a U+003C LESS-THAN SIGN character token. this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition)); + + // Reconsume in the data state. this.reconsumeIn(State.Data); } } break; } + // 13.2.5.7 End tag open state https://html.spec.whatwg.org/multipage/parsing.html#end-tag-open-state case State.EndTagOpen: { + // Consume the next input character: switch (this.consumeNext()) { - case '\u003E': this.parseError('missing-end-tag-name'); this.state = State.Data; break; + // U+003E GREATER-THAN SIGN (>) + case '\u003E': + // This is a missing-end-tag-name parse error. + this.parseError('missing-end-tag-name'); + + // Switch to the data state. + this.state = State.Data; + break; + // EOF case undefined: + // This is an eof-before-tag-name parse error. this.parseError('eof-before-tag-name'); + + // Emit a U+003C LESS-THAN SIGN character token, this.emit(CharacterToken.createWith('\u003C').at(this.currentPosition)); + + // a U+002F SOLIDUS character token this.emit(CharacterToken.createWith('\u002F').at(this.currentPosition)); + + // and an end-of-file token. this.emit(EndOfFileToken.create()); break; default: { + // ASCII alpha if (this.asciiAlpha(this.currentInputCharacter)) { + // Create a new end tag token, set its tag name to the empty string. this.create(EndTagToken.createEmpty().startingAt(this.currentPosition)); + + // Reconsume in the tag name state. this.reconsumeIn(State.TagName); break; } + // Anything else + // This is an invalid-first-character-of-tag-name parse error. this.parseError('invalid-first-character-of-tag-name'); + + // Create a comment token whose data is the empty string. this.create(CommentToken.createEmpty().startingAt(this.currentPosition)); + + // Reconsume in the bogus comment state. this.reconsumeIn(State.BogusComment); } } break; } + // 13.2.5.42 Markup declaration open state https://html.spec.whatwg.org/multipage/parsing.html#markup-declaration-open-state case State.MarkupDeclarationOpen: { + // If the next few characters are: + // Two U+002D HYPHEN-MINUS characters (-) if (this.matchNextFew('--')) { + // Consume those two characters, this.consumeNextFew('--'); + + // create a comment token whose data is the empty string, this.create(CommentToken.createEmpty().startingAt(this.currentPosition.copy().decrement(4))); + + // and switch to the comment start state. this.state = State.CommentStart; + // ASCII case-insensitive match for the word "DOCTYPE" } else if (this.matchNextFewCaseInsensitive('DOCTYPE')) { + // Consume those characters this.consumeNextFewCaseInsensitive('DOCTYPE'); + + // and switch to the DOCTYPE state. this.state = State.DOCTYPE; + // The string "[CDATA[" (the five uppercase letters "CDATA" with a U+005B LEFT SQUARE BRACKET character before and after) } else if (this.matchNextFew('[CDATA[')) { + // Consume those characters this.consumeNextFew('[CDATA['); // NOTE: This parser will never be generated as part of the fragment parsing algorithm, as such the CDATA section state does not // exist and will not be started here. + + // This is a cdata-in-html-content parse error. this.parseError('cdata-in-html-content'); + + // Create a comment token whose data is the "[CDATA[" string. this.create(CommentToken.createWith('[CDATA[').startingAt(this.currentPosition)); + + // Switch to the bogus comment state. this.state = State.BogusComment; + // Anything else } else { + // This is an incorrectly-opened-comment parse error. this.parseError('incorrectly-opened-comment'); + + // Create a comment token whose data is the empty string. this.create(CommentToken.createEmpty().startingAt(this.currentPosition)); + + // Switch to the bogus comment state (don't consume anything in the current state). this.state = State.BogusComment; } break; } + // 13.2.5.53 DOCTYPE state https://html.spec.whatwg.org/multipage/parsing.html#doctype-state case State.DOCTYPE: { + // Consume the next input character: switch (this.consumeNext()) { + // U+0009 CHARACTER TABULATION (tab) + // U+000A LINE FEED (LF) + // U+000C FORM FEED (FF) + // U+0020 SPACE case '\u0009': case '\u000A': case '\u000C': - case '\u0020': this.state = State.BeforeDOCTYPEName; break; - case '\u003E': this.reconsumeIn(State.BeforeDOCTYPEName); break; + case '\u0020': + // Switch to the before DOCTYPE name state. + this.state = State.BeforeDOCTYPEName; + break; + // U+003E GREATER-THAN SIGN (>) + case '\u003E': + // Reconsume in the before DOCTYPE name state. + this.reconsumeIn(State.BeforeDOCTYPEName); + break; + // EOF case undefined: + // This is an eof-in-doctype parse error. this.parseError('eof-in-doctype'); + + // Create a new DOCTYPE token. Set its force-quirks flag to on. Emit the current token. this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition)); + + // Emit an end-of-file token. this.emit(EndOfFileToken.create()); break; + // Anything else default: + // This is a missing-whitespace-before-doctype-name parse error. this.parseError('missing-whitespace-before-doctype-name'); + + // Reconsume in the before DOCTYPE name state. this.reconsumeIn(State.BeforeDOCTYPEName); } break; } + // 13.2.5.54 Before DOCTYPE name state https://html.spec.whatwg.org/multipage/parsing.html#before-doctype-name-state case State.BeforeDOCTYPEName: { + // Consume the next input character: switch (this.consumeNext()) { + // U+0009 CHARACTER TABULATION (tab) + // U+000A LINE FEED (LF) + // U+000C FORM FEED (FF) + // U+0020 SPACE case '\u0009': case '\u000A': case '\u000C': - case '\u0020': break; + case '\u0020': + // Ignore the character. + break; + // U+0000 NULL case '\u0000': + // This is an unexpected-null-character parse error. this.parseError('unexpected-null-character'); - this.create(DOCTYPEToken.createWithName('\uFFFD').startingAt(this.currentPosition)); + + // Create a new DOCTYPE token. Set the token's name to a U+FFFD REPLACEMENT CHARACTER character. + this.create(DOCTYPEToken.createWithReplacementCharacter().startingAt(this.currentPosition)); + + // Switch to the DOCTYPE name state. this.state = State.DOCTYPEName; break; + // U+003E GREATER-THAN SIGN (>) + case '\u003E': + // This is a missing-doctype-name parse error. + this.parseError('missing-doctype-name'); + + // Create a new DOCTYPE token. Set its force-quirks flag to on. + this.create(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition)); + + // Switch to the data state. + this.state = State.Data; + + // Emit the current token. + this.emitCurrentOfType(DOCTYPEToken); + break; + // EOF case undefined: + // This is an eof-in-doctype parse error. this.parseError('eof-in-doctype'); + + // Create a new DOCTYPE token. Set its force-quirks flag to on. Emit the current token. this.emit(DOCTYPEToken.createWithForcedQuirks().at(this.currentPosition)); + + // Emit an end-of-file token. this.emit(EndOfFileToken.create()); break; default: { + // ASCII upper alpha if (this.asciiUpperAlpha(this.currentInputCharacter)) { + // Create a new DOCTYPE token. Set the token's name to the lowercase version of the current + // input character (add 0x0020 to the character's code point). this.create(DOCTYPEToken.createWithName(this.currentInputCharacter.toLowerCase()).startingAt(this.currentPosition)); + + // Switch to the DOCTYPE name state. this.state = State.DOCTYPEName; break; } + // Anything else + // Create a new DOCTYPE token. Set the token's name to the current input character. this.create(DOCTYPEToken.createWithName(this.currentInputCharacter).startingAt(this.currentPosition)); + + // Switch to the DOCTYPE name state. this.state = State.DOCTYPEName; } } break; } + // 13.2.5.55 DOCTYPE name state https://html.spec.whatwg.org/multipage/parsing.html#doctype-name-state case State.DOCTYPEName: { + // Consume the next input character: switch (this.consumeNext()) { + // U+0009 CHARACTER TABULATION (tab) + // U+000C FORM FEED (FF) + // U+000A LINE FEED (LF) + // U+0020 SPACE case '\u0009': case '\u000A': case '\u000C': - case '\u0020': this.state = State.AfterDOCTYPEName; break; - case '\u003E': this.state = State.Data; this.emitCurrentOfType(DOCTYPEToken); break; - case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(DOCTYPEToken).appendReplacementCharacterToName(); break; - case undefined: - this.parseError('eof-in-doctype'); - this.currentOfType(DOCTYPEToken).forceQuirks = true; + case '\u0020': + // Switch to the after DOCTYPE name state. + this.state = State.AfterDOCTYPEName; + break; + // U+003E GREATER-THAN SIGN (>) + case '\u003E': + // Switch to the data state. + this.state = State.Data; + + // Emit the current DOCTYPE token. this.emitCurrentOfType(DOCTYPEToken); + break; + // U+0000 NULL + case '\u0000': + // This is an unexpected-null-character parse error. + this.parseError('unexpected-null-character'); + + // Append a U+FFFD REPLACEMENT CHARACTER character to the current DOCTYPE token's name. + this.currentOfType(DOCTYPEToken).appendReplacementCharacterToName(); + break; + // EOF + case undefined: + // This is an eof-in-doctype parse error. + this.parseError('eof-in-doctype'); + + // Set the current DOCTYPE token's force-quirks flag to on. + this.currentOfType(DOCTYPEToken).setForceQuirks(); + + // Emit the current DOCTYPE token. + this.emitCurrentOfType(DOCTYPEToken); + + // Emit an end-of-file token. this.emit(EndOfFileToken.create()); break; default: { + // ASCII upper alpha if (this.asciiUpperAlpha(this.currentInputCharacter)) { + // Append the lowercase version of the current input character (add 0x0020 to the character's + // code point) to the current DOCTYPE token's name. this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter.toLowerCase()); break; } + // Anything else: Append the current input character to the current DOCTYPE token's name. this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter); } } break; } + // 13.2.5.8 Tag name state https://html.spec.whatwg.org/multipage/parsing.html#tag-name-state case State.TagName: { + // Consume the next input character: switch (this.consumeNext()) { + // U+0009 CHARACTER TABULATION (tab) + // U+000A LINE FEED (LF) + // U+000C FORM FEED (FF) + // U+0020 SPACE case '\u0009': case '\u000A': case '\u000C': - case '\u0020': this.state = State.BeforeAttributeName; break; + case '\u0020': + // Switch to the before attribute name state. + this.state = State.BeforeAttributeName; + break; + // U+002F SOLIDUS (/): Switch to the self-closing start tag state. case '\u002F': this.state = State.SelfClosingStartTag; break; - case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break; + // U+003E GREATER-THAN SIGN (>) + case '\u003E': + // Switch to the data state. + this.state = State.Data; + + // Emit the current tag token. + this.emitCurrentOfEitherType(StartTagToken, EndTagToken); + break; + // U+0000 NULL case '\u0000': + // This is an unexpected-null-character parse error. this.parseError('unexpected-null-character'); + + // Append a U+FFFD REPLACEMENT CHARACTER character to the current tag token's tag name. this.currentOfEitherType(StartTagToken, EndTagToken).appendReplacementCharacterToName(); break; - case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break; + // EOF + case undefined: + // This is an eof-in-tag parse error. + this.parseError('eof-in-tag'); + + // Emit an end-of-file token. + this.emit(EndOfFileToken.create()); break; default: { + // ASCII upper alpha if (this.asciiUpperAlpha(this.currentInputCharacter)) { + // Append the lowercase version of the current input character (add 0x0020 to the character's + // code point) to the current tag token's tag name. this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter.toLowerCase()); break; } + // Anything else: Append the current input character to the current tag token's tag name. this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter); } } break; } + // 13.2.5.32 Before attribute name state https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-name-state case State.BeforeAttributeName: { + // Consume the next input character: switch (this.consumeNext()) { + // U+0009 CHARACTER TABULATION (tab) + // U+000A LINE FEED (LF) + // U+000C FORM FEED (FF) + // U+0020 SPACE case '\u0009': case '\u000A': case '\u000C': - case '\u0020': break; + case '\u0020': + // Ignore the character. + break; + // U+002F SOLIDUS (/) + // U+003E GREATER-THAN SIGN (>) + // EOF case '\u002F': case '\u003E': - case undefined: this.reconsumeIn(State.AfterAttributeName); break; + case undefined: + // Reconsume in the after attribute name state. + this.reconsumeIn(State.AfterAttributeName); + break; + // U+003D EQUALS SIGN (=) case '\u003D': { + // This is an unexpected-equals-sign-before-attribute-name parse error. this.parseError('unexpected-equals-sign-before-attribute-name'); + + // Start a new attribute in the current tag token. Set that attribute's name to + // the current input character, and its value to the empty string. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyValue(this.currentInputCharacter).startingNameAt(this.currentPosition)); + + // Switch to the attribute name state. this.state = State.AttributeName; break; } + // Anything else default: { + // Start a new attribute in the current tag token. Set that attribute name and value to the empty string. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue().startingNameAt(this.currentPosition)); + + // Reconsume in the attribute name state. this.reconsumeIn(State.AttributeName); } } break; } + // 13.2.5.33 Attribute name state https://html.spec.whatwg.org/multipage/parsing.html#attribute-name-state case State.AttributeName: { + // Consume the next input character: switch (this.consumeNext()) { - case '\u0009': + // U+000A LINE FEED (LF) + // U+000C FORM FEED (FF) + // U+0020 SPACE + // U+002F SOLIDUS (/) + // U+003E GREATER-THAN SIGN (>) + // EOF + // U+0009 CHARACTER TABULATION (tab) case '\u000A': case '\u000C': case '\u0020': case '\u002F': case '\u003E': - case undefined: this.reconsumeIn(State.AfterAttributeName); break; + case undefined: + case '\u0009': + // Reconsume in the after attribute name state. + this.reconsumeIn(State.AfterAttributeName); + break; + // U+003D EQUALS SIGN (=) case '\u003D': this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingNameAt(this.currentPosition.copy().decrement(1)); this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.equalsAt(this.currentPosition); + + // Switch to the before attribute value state. this.state = State.BeforeAttributeValue; break; - case '\u0000': this.parseError('unexpected-null-character'); + // U+0000 NULL + case '\u0000': + // This is an unexpected-null-character parse error. + this.parseError('unexpected-null-character'); + + // Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's name. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToName(); break; + // U+0022 QUOTATION MARK (") + // U+0027 APOSTROPHE (') + // U+003C LESS-THAN SIGN (<) case '\u0022': case '\u0027': case '\u003C': + // This is an unexpected-character-in-attribute-name parse error. this.parseError('unexpected-character-in-attribute-name'); + + // Treat it as per the "anything else" entry below. + // Append the current input character to the current attribute's name. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter); break; default: { + // ASCII upper alpha if (this.asciiUpperAlpha(this.currentInputCharacter)) { + // Append the lowercase version of the current input character (add 0x0020 to the character's + // code point) to the current attribute's name. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter.toLowerCase()); break; } + // Append the current input character to the current attribute's name. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter); } } break; } + // 13.2.5.34 After attribute name state https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-name-state case State.AfterAttributeName: { + // Consume the next input character: switch (this.consumeNext()) { + // U+0009 CHARACTER TABULATION (tab) + // U+000A LINE FEED (LF) + // U+000C FORM FEED (FF) + // U+0020 SPACE case '\u0009': case '\u000A': case '\u000C': - case '\u0020': break; + case '\u0020': + // Ignore the character. + break; + // U+002F SOLIDUS (/) case '\u002F': this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingNameAt(this.currentPosition); + + // Switch to the self-closing start tag state. this.state = State.SelfClosingStartTag; break; + // U+003D EQUALS SIGN (=) case '\u003D': this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingNameAt(this.currentPosition); this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.equalsAt(this.currentPosition); + + // Switch to the before attribute value state. this.state = State.BeforeAttributeValue; break; + // U+003E GREATER-THAN SIGN (>) case '\u003E': this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingNameAt(this.currentPosition); + + // Switch to the data state. this.state = State.Data; + + // Emit the current tag token. this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break; - case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break; + // EOF + case undefined: + // This is an eof-in-tag parse error. + this.parseError('eof-in-tag'); + + // Emit an end-of-file token. + this.emit(EndOfFileToken.create()); + break; default: + // Anything else + // Start a new attribute in the current tag token. Set that attribute name and value to the empty string. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.append(Attribute.createWithEmptyNameAndValue().startingNameAt(this.currentPosition)); + + // Reconsume in the attribute name state. this.reconsumeIn(State.AttributeName); break; } break; } + // 13.2.5.35 Before attribute value state https://html.spec.whatwg.org/multipage/parsing.html#before-attribute-value-state case State.BeforeAttributeValue: { + // Consume the next input character: switch (this.consumeNext()) { + // U+0009 CHARACTER TABULATION (tab) + // U+000A LINE FEED (LF) + // U+000C FORM FEED (FF) + // U+0020 SPACE case '\u0009': case '\u000A': case '\u000C': - case '\u0020': break; + case '\u0020': + // Ignore the character. + break; + // U+0022 QUOTATION MARK (") case '\u0022': this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.startingValueAt(this.currentPosition); + + // Switch to the attribute value (double-quoted) state. this.state = State.AttributeValueDouble; break; + // U+0027 APOSTROPHE (') case '\u0027': this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.startingValueAt(this.currentPosition); + + // Switch to the attribute value (single-quoted) state. this.state = State.AttributeValueSingle; break; + // U+003E GREATER-THAN SIGN (>) case '\u003E': - this.parseError('missing-attribute-value'); - this.state = State.Data; this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingNameAt(this.currentPosition); + + // This is a missing-attribute-value parse error. + this.parseError('missing-attribute-value'); + + // Switch to the data state. + this.state = State.Data; + + // Emit the current tag token. this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break; default: + // Anything else this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.startingValueAt(this.currentPosition); + + // Reconsume in the attribute value (unquoted) state. this.reconsumeIn(State.AttributeValueUnquoted); } break; } + // 13.2.5.36 Attribute value (double-quoted) state https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(double-quoted)-state case State.AttributeValueDouble: { + // Consume the next input character: switch (this.consumeNext()) { + // U+0022 QUOTATION MARK (") case '\u0022': this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingValueAt(this.currentPosition); + + // Switch to the after attribute value (quoted) state. this.state = State.AfterAttributeValue; break; - case '\u0026': this.returnState = State.AttributeValueDouble; this.state = State.CharacterReference; break; + // U+0026 AMPERSAND (&) + case '\u0026': + // Set the return state to the attribute value (double-quoted) state. + this.returnState = State.AttributeValueDouble; + + // Switch to the character reference state. + this.state = State.CharacterReference; + break; + // U+0000 NULL case '\u0000': + // This is an unexpected-null-character parse error. this.parseError('unexpected-null-character'); + + // Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue(); break; - case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break; + // EOF + case undefined: + // This is an eof-in-tag parse error. + this.parseError('eof-in-tag'); + + // Emit an end-of-file token. + this.emit(EndOfFileToken.create()); + break; + // Anything else: Append the current input character to the current attribute's value. default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter); } break; } + // 13.2.5.37 Attribute value (single-quoted) state https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(single-quoted)-state case State.AttributeValueSingle: { + // Consume the next input character: switch (this.consumeNext()) { + // U+0027 APOSTROPHE (') case '\u0027': this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingValueAt(this.currentPosition); + + // Switch to the after attribute value (quoted) state. this.state = State.AfterAttributeValue; break; - case '\u0026': this.returnState = State.AttributeValueSingle; this.state = State.CharacterReference; break; + // U+0026 AMPERSAND (&) + case '\u0026': + // Set the return state to the attribute value (single-quoted) state. + this.returnState = State.AttributeValueSingle; + + // Switch to the character reference state. + this.state = State.CharacterReference; + break; + // U+0000 NULL case '\u0000': + // This is an unexpected-null-character parse error. this.parseError('unexpected-null-character'); + + // Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue(); break; - case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break; + // EOF + case undefined: + // This is an eof-in-tag parse error. + this.parseError('eof-in-tag'); + + // Emit an end-of-file token. + this.emit(EndOfFileToken.create()); break; + // Anything else: Append the current input character to the current attribute's value. default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter); } break; } + // 13.2.5.38 Attribute value (unquoted) state https://html.spec.whatwg.org/multipage/parsing.html#attribute-value-(unquoted)-state case State.AttributeValueUnquoted: { this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.setUnquoted(); + // Consume the next input character: switch (this.consumeNext()) { + // U+0009 CHARACTER TABULATION (tab) + // U+000A LINE FEED (LF) + // U+000C FORM FEED (FF) + // U+0020 SPACE case '\u0009': case '\u000A': case '\u000C': - case '\u0020': this.state = State.BeforeAttributeName; break; - case '\u0026': this.returnState = State.AttributeValueUnquoted; this.state = State.CharacterReference; break; + case '\u0020': + // Switch to the before attribute name state. + this.state = State.BeforeAttributeName; + break; + // U+0026 AMPERSAND (&) + case '\u0026': + // Set the return state to the attribute value (unquoted) state. + this.returnState = State.AttributeValueUnquoted; + + // Switch to the character reference state. + this.state = State.CharacterReference; + break; + // U+003E GREATER-THAN SIGN (>) case '\u003E': this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.endingValueAt(this.currentPosition); + + // Switch to the data state. this.state = State.Data; + + // Emit the current tag token. this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break; + // U+0000 NULL case '\u0000': + // This is an unexpected-null-character parse error. this.parseError('unexpected-null-character'); + + // Append a U+FFFD REPLACEMENT CHARACTER character to the current attribute's value. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendReplacementCharacterToValue(); break; + // U+0022 QUOTATION MARK (") + // U+0027 APOSTROPHE (') + // U+003C LESS-THAN SIGN (<) + // U+003D EQUALS SIGN (=) + // U+0060 GRAVE ACCENT (`) case '\u0022': case '\u0027': case '\u003C': case '\u003D': case '\u0060': + // This is an unexpected-character-in-unquoted-attribute-value parse error. this.parseError('unexpected-character-in-unquoted-attribute-value'); + + // Treat it as per the "anything else" entry below. + // Append the current input character to the current attribute's value. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter); break; - case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break; + // EOF + case undefined: + // This is an eof-in-tag parse error. + this.parseError('eof-in-tag'); + + // Emit an end-of-file token. + this.emit(EndOfFileToken.create()); break; + // Anything else: Append the current input character to the current attribute's value. default: this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter); } break; } + // 13.2.5.39 After attribute value (quoted) state https://html.spec.whatwg.org/multipage/parsing.html#after-attribute-value-(quoted)-state case State.AfterAttributeValue: { + // Consume the next input character: switch (this.consumeNext()) { + // U+0009 CHARACTER TABULATION (tab) + // U+000A LINE FEED (LF) + // U+000C FORM FEED (FF) + // U+0020 SPACE case '\u0009': case '\u000A': case '\u000C': - case '\u0020': this.state = State.BeforeAttributeName; break; + case '\u0020': + // Switch to the before attribute name state. + this.state = State.BeforeAttributeName; + break; + // U+002F SOLIDUS (/): Switch to the self-closing start tag state. case '\u002F': this.state = State.SelfClosingStartTag; break; - case '\u003E': this.state = State.Data; this.emitCurrentOfEitherType(StartTagToken, EndTagToken); break; - case undefined: this.parseError('eof-in-tag'); this.emit(EndOfFileToken.create()); break; - default: this.parseError('missing-whitespace-between-attributes'); this.reconsumeIn(State.BeforeAttributeName); + // U+003E GREATER-THAN SIGN (>) + case '\u003E': + // Switch to the data state. + this.state = State.Data; + + // Emit the current tag token. + this.emitCurrentOfEitherType(StartTagToken, EndTagToken); + break; + // EOF + case undefined: + // This is an eof-in-tag parse error. + this.parseError('eof-in-tag'); + + // Emit an end-of-file token. + this.emit(EndOfFileToken.create()); + break; + // Anything else + default: + // This is a missing-whitespace-between-attributes parse error. + this.parseError('missing-whitespace-between-attributes'); + + // Reconsume in the before attribute name state. + this.reconsumeIn(State.BeforeAttributeName); } break; } + // 13.2.5.43 Comment start state case State.CommentStart: { + // Consume the next input character: switch (this.consumeNext()) { + // U+002D HYPHEN-MINUS (-): Switch to the comment start dash state. case '\u002D': this.state = State.CommentStartDash; break; - case '\u003E': this.parseError('abrupt-closing-of-empty-comment'); this.state = State.Data; this.emitCurrentOfType(CommentToken); break; + // U+003E GREATER-THAN SIGN (>) + case '\u003E': + // This is an abrupt-closing-of-empty-comment parse error. + this.parseError('abrupt-closing-of-empty-comment'); + + // Switch to the data state. + this.state = State.Data; + + // Emit the current comment token. + this.emitCurrentOfType(CommentToken); + break; + // Anything else: Reconsume in the comment state. default: this.reconsumeIn(State.Comment); } break; } // FIXME: Possible improvement to https://html.spec.whatwg.org/multipage/parsing.html#comment-state (adding **current** in some places) + // 13.2.5.45 Comment state https://html.spec.whatwg.org/multipage/parsing.html#comment-state case State.Comment: { + // Consume the next input character: switch (this.consumeNext()) { - case '\u003C': this.currentOfType(CommentToken).append(this.currentInputCharacter); this.state = State.CommentLessThanSign; break; + // U+003C LESS-THAN SIGN (<) + case '\u003C': + // Append the current input character to the comment token's data. + this.currentOfType(CommentToken).append(this.currentInputCharacter); + + // Switch to the comment less-than sign state. + this.state = State.CommentLessThanSign; + break; + // U+002D HYPHEN-MINUS (-): Switch to the comment end dash state. case '\u002D': this.state = State.CommentEndDash; break; - case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(CommentToken).appendReplacementCharacter(); break; - case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break; + // U+0000 NULL + case '\u0000': + // This is an unexpected-null-character parse error. + this.parseError('unexpected-null-character'); + + // Append a U+FFFD REPLACEMENT CHARACTER character to the comment token's data. + this.currentOfType(CommentToken).appendReplacementCharacter(); + break; + // EOF + case undefined: + // This is an eof-in-comment parse error. + this.parseError('eof-in-comment'); + + // Emit the current comment token. + this.emitCurrentOfType(CommentToken); + + // Emit an end-of-file token. + this.emit(EndOfFileToken.create()); + break; + // Anything else: Append the current input character to the comment token's data. default: this.currentOfType(CommentToken).append(this.currentInputCharacter); } break; } + // 13.2.5.50 Comment end dash state https://html.spec.whatwg.org/multipage/parsing.html#comment-end-dash-state case State.CommentEndDash: { + // Consume the next input character: switch (this.consumeNext()) { + // U+002D HYPHEN-MINUS (-): Switch to the comment end state. case '\u002D': this.state = State.CommentEnd; break; - case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break; - default: this.currentOfType(CommentToken).append('\u002D'); this.reconsumeIn(State.Comment); + // EOF + case undefined: + // This is an eof-in-comment parse error. + this.parseError('eof-in-comment'); + + // Emit the current comment token. + this.emitCurrentOfType(CommentToken); + + // Emit an end-of-file token. + this.emit(EndOfFileToken.create()); + break; + // Anything else + default: + // Append a U+002D HYPHEN-MINUS character (-) to the comment token's data. + this.currentOfType(CommentToken).append('\u002D'); + + // Reconsume in the comment state. + this.reconsumeIn(State.Comment); } break; } // Same as above fixme https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state + // 13.2.5.51 Comment end state https://html.spec.whatwg.org/multipage/parsing.html#comment-end-state case State.CommentEnd: { + // Consume the next input character: switch (this.consumeNext()) { - case '\u003E': this.state = State.Data; this.emit(this.currentOfType(CommentToken).endingAt(this.currentPosition.copy().increment(1))); break; + // U+003E GREATER-THAN SIGN (>) + case '\u003E': + // Switch to the data state. + this.state = State.Data; + + // Emit the current comment token. + this.emit(this.currentOfType(CommentToken).endingAt(this.currentPosition.copy().increment(1))); + break; + // U+0021 EXCLAMATION MARK (!): Switch to the comment end bang state. case '\u0021': this.state = State.CommentEndBang; break; + // U+002D HYPHEN-MINUS (-): Append a U+002D HYPHEN-MINUS character (-) to the comment token's data. case '\u002D': this.currentOfType(CommentToken).append('\u002D'); break; - case undefined: this.parseError('eof-in-comment'); this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break; - default: this.currentOfType(CommentToken).append('\u002D\u002D'); this.reconsumeIn(State.Comment); + // EOF + case undefined: + // This is an eof-in-comment parse error. + this.parseError('eof-in-comment'); + + // Emit the current comment token. + this.emitCurrentOfType(CommentToken); + + // Emit an end-of-file token. + this.emit(EndOfFileToken.create()); + break; + // Anything else + default: + // Append two U+002D HYPHEN-MINUS characters (-) to the comment token's data. + this.currentOfType(CommentToken).append('\u002D\u002D'); + + // Reconsume in the comment state. + this.reconsumeIn(State.Comment); } break; } // Same as above https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state + // 13.2.5.41 Bogus comment state https://html.spec.whatwg.org/multipage/parsing.html#bogus-comment-state case State.BogusComment: { + // Consume the next input character: switch (this.consumeNext()) { - case '\u003E': this.state = State.Data; this.emitCurrentOfType(CommentToken); break; - case undefined: this.emitCurrentOfType(CommentToken); this.emit(EndOfFileToken.create()); break; - case '\u0000': this.parseError('unexpected-null-character'); this.currentOfType(CommentToken).appendReplacementCharacter(); break; + // U+003E GREATER-THAN SIGN (>) + case '\u003E': + // Switch to the data state. + this.state = State.Data; + + // Emit the current comment token. + this.emitCurrentOfType(CommentToken); + break; + // EOF + case undefined: + // Emit the comment. + this.emitCurrentOfType(CommentToken); + + // Emit an end-of-file token. + this.emit(EndOfFileToken.create()); + break; + // U+0000 NULL + case '\u0000': + // This is an unexpected-null-character parse error. + this.parseError('unexpected-null-character'); + + // Append a U+FFFD REPLACEMENT CHARACTER character to the comment token's data. + this.currentOfType(CommentToken).appendReplacementCharacter(); + break; + // Anything else: Append the current input character to the comment token's data. default: this.currentOfType(CommentToken).append(this.currentInputCharacter); } break; } + // 13.2.5.72 Character reference state https://html.spec.whatwg.org/multipage/parsing.html#character-reference-state case State.CharacterReference: { + // Set the temporary buffer to the empty string. this.temporaryBuffer = ''; + + // Append a U+0026 AMPERSAND (&) character to the temporary buffer. this.temporaryBuffer += '\u0026'; + // Consume the next input character: switch (this.consumeNext()) { - case '\u0023': this.temporaryBuffer += this.currentInputCharacter; this.state = State.NumericCharacterReference; break; + // U+0023 NUMBER SIGN (#) + case '\u0023': + // Append the current input character to the temporary buffer. + this.temporaryBuffer += this.currentInputCharacter; + + // Switch to the numeric character reference state. + this.state = State.NumericCharacterReference; + break; default: { + // ASCII alphanumeric if (this.asciiAlphanumeric(this.currentInputCharacter)) { + // Reconsume in the named character reference state. this.reconsumeIn(State.NamedCharacterReference); break; } + // Anything else + // Flush code points consumed as a character reference. this.flushCodePointsConsumedAsCharacterReference(); + + // Reconsume in the return state. this.reconsumeIn(this.returnState); } } break; } + // 13.2.5.73 Named character reference state https://html.spec.whatwg.org/multipage/parsing.html#named-character-reference-state case State.NamedCharacterReference: { let match = false; + // Consume the maximum number of characters possible, where the consumed characters are one of the + // identifiers in the first column of the named character references table. Append each character to the + // temporary buffer when it's consumed. + + // NOTE: entities are sorted by length, long first for (const entry in entities) { if (this.matchNextFew(entry)) { + // If there is a match match = true; this.consumeNextFew(entry); this.temporaryBuffer += entry; + // If the character reference was consumed as part of an attribute, and the last character matched + // is not a U+003B SEMICOLON character (;), and the next input character is either a U+003D + // EQUALS SIGN character (=) or an ASCII alphanumeric, then, for historical reasons, if (this.consumedAsPartOfAnAttribute() && entry[entry.length - 1] !== '\u003B' && (this.next() === '\u003D' || this.asciiAlphanumeric(this.next() ?? ''))) { + // flush code points consumed as a character reference and this.flushCodePointsConsumedAsCharacterReference(); + + // switch to the return state. this.state = this.returnState; break; } + // Otherwise: + // 1. If the last character matched is not a U+003B SEMICOLON character (;), then this is a + // missing-semicolon-after-character-reference parse error. if (entry[entry.length - 1] !== '\u003B') this.parseError('missing-semicolon-after-character-reference'); + // 2. Set the temporary buffer to the empty string. this.temporaryBuffer = ''; + + // Append one or two characters corresponding to the character reference name (as given by the second + // column of the named character references table) to the temporary buffer. this.temporaryBuffer += entities[entry].characters; + + // 3. Flush code points consumed as a character reference. this.flushCodePointsConsumedAsCharacterReference(); + + // Switch to the return state. this.state = this.returnState; break; } } + // Otherwise if (!match) { + // Flush code points consumed as a character reference. this.flushCodePointsConsumedAsCharacterReference(); + + // Switch to the ambiguous ampersand state. this.state = State.AmbiguousAmpersand; } break; } + // 13.2.5.74 Ambiguous ampersand state case State.AmbiguousAmpersand: { + // Consume the next input character: switch (this.consumeNext()) { - case '\u003B': this.parseError('unknown-named-character-reference'); this.reconsumeIn(this.returnState); break; + // U+003B SEMICOLON (;) + case '\u003B': + // This is an unknown-named-character-reference parse error. + this.parseError('unknown-named-character-reference'); + + // Reconsume in the return state. + this.reconsumeIn(this.returnState); break; default: { + // ASCII alphanumeric if (this.asciiAlphanumeric(this.currentInputCharacter)) { + // If the character reference was consumed as part of an attribute, if (this.consumedAsPartOfAnAttribute()) { + // then append the current input character to the current attribute's value. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.currentInputCharacter); + // Otherwise, } else { + // emit the current input character as a character token. this.emit(CharacterToken.createWith(this.currentInputCharacter)); } break; } + // Anything else: Reconsume in the return state. this.reconsumeIn(this.returnState); } } @@ -560,12 +1204,15 @@ export class Tokenizer { } } + // flush code points consumed as a character reference https://html.spec.whatwg.org/multipage/parsing.html#flush-code-points-consumed-as-a-character-reference private flushCodePointsConsumedAsCharacterReference(): void { + // append the code point from the buffer to the current attribute's value if the character reference was consumed as part of an attribute, if (this.consumedAsPartOfAnAttribute()) { this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToValue(this.temporaryBuffer); return; } + // or emit the code point as a character token otherwise. for (const codePoint of this.temporaryBuffer) this.emit(CharacterToken.createWith(codePoint)); } diff --git a/src/html/tokenizer/tokens/doctype.ts b/src/html/tokenizer/tokens/doctype.ts index a041ae7..8902779 100644 --- a/src/html/tokenizer/tokens/doctype.ts +++ b/src/html/tokenizer/tokens/doctype.ts @@ -26,6 +26,10 @@ export class DOCTYPEToken extends Token { this.appendToName(REPLACEMENT_CHARACTER); } + public setForceQuirks(): void { + this.forceQuirks = true; + } + public static createWithForcedQuirks(): DOCTYPEToken { return new DOCTYPEToken(undefined, undefined, undefined, true); } @@ -34,6 +38,10 @@ export class DOCTYPEToken extends Token { return new DOCTYPEToken(name, undefined, undefined, undefined); } + public static createWithReplacementCharacter(): DOCTYPEToken { + return new DOCTYPEToken(REPLACEMENT_CHARACTER, undefined, undefined, undefined); + } + public override inspect(indent: number): string { return `DOCTYPEToken { '${this.name}' }`; }