diff --git a/src/html/tokenizer.ts b/src/html/tokenizer.ts index efc30fe..14de417 100644 --- a/src/html/tokenizer.ts +++ b/src/html/tokenizer.ts @@ -1,4 +1,5 @@ import { TODO, VERIFY, VERIFY_NOT_REACHED } from '../util/assertions.js'; +import { CodePoints } from '../util/codePoints.js'; import { Constructor } from '../util/guards.js'; import { ParseError } from './errors.js'; import { Attribute } from './tokenizer/attribute.js'; @@ -124,7 +125,7 @@ export class Tokenizer { break; default: { // ASCII alpha - if (this.asciiAlpha(this.currentInputCharacter)) { + if (CodePoints.ASCIIiAlpha(this.currentInputCharacter)) { // Create a new start tag token, set its tag name to the empty string. this.create(StartTagToken.createEmpty().startingAt(this.currentPosition)); @@ -175,7 +176,7 @@ export class Tokenizer { break; default: { // ASCII alpha - if (this.asciiAlpha(this.currentInputCharacter)) { + if (CodePoints.ASCIIiAlpha(this.currentInputCharacter)) { // Create a new end tag token, set its tag name to the empty string. this.create(EndTagToken.createEmpty().startingAt(this.currentPosition)); @@ -341,7 +342,7 @@ export class Tokenizer { break; default: { // ASCII upper alpha - if (this.asciiUpperAlpha(this.currentInputCharacter)) { + if (CodePoints.ASCIIUpperAlpha(this.currentInputCharacter)) { // Create a new DOCTYPE token. Set the token's name to the lowercase version of the current // input character (add 0x0020 to the character's code point). this.create(DOCTYPEToken.createWithName(this.currentInputCharacter.toLowerCase()).startingAt(this.currentPosition)); @@ -409,7 +410,7 @@ export class Tokenizer { break; default: { // ASCII upper alpha - if (this.asciiUpperAlpha(this.currentInputCharacter)) { + if (CodePoints.ASCIIUpperAlpha(this.currentInputCharacter)) { // Append the lowercase version of the current input character (add 0x0020 to the character's // code point) to the current DOCTYPE token's name. 
this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter.toLowerCase()); @@ -465,7 +466,7 @@ export class Tokenizer { this.emit(EndOfFileToken.create()); break; default: { // ASCII upper alpha - if (this.asciiUpperAlpha(this.currentInputCharacter)) { + if (CodePoints.ASCIIUpperAlpha(this.currentInputCharacter)) { // Append the lowercase version of the current input character (add 0x0020 to the character's // code point) to the current tag token's tag name. this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter.toLowerCase()); @@ -579,7 +580,7 @@ export class Tokenizer { break; default: { // ASCII upper alpha - if (this.asciiUpperAlpha(this.currentInputCharacter)) { + if (CodePoints.ASCIIUpperAlpha(this.currentInputCharacter)) { // Append the lowercase version of the current input character (add 0x0020 to the character's // code point) to the current attribute's name. this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter.toLowerCase()); @@ -1086,7 +1087,7 @@ export class Tokenizer { break; default: { // ASCII alphanumeric - if (this.asciiAlphanumeric(this.currentInputCharacter)) { + if (CodePoints.ASCIIAlphanumeric(this.currentInputCharacter)) { // Reconsume in the named character reference state. this.reconsumeIn(State.NamedCharacterReference); break; @@ -1123,7 +1124,7 @@ export class Tokenizer { // If the character reference was consumed as part of an attribute, and the last character matched // is not a U+003B SEMICOLON character (;), and the next input character is either a U+003D // EQUALS SIGN character (=) or an ASCII alphanumeric, then, for historical reasons, - if (this.consumedAsPartOfAnAttribute() && entry[entry.length - 1] !== '\u003B' && (this.next() === '\u003D' || this.asciiAlphanumeric(this.next() ?? 
// 4.5. Code points https://infra.spec.whatwg.org/#code-points
/**
 * Predicates for the ASCII code point classes defined by the WHATWG Infra Standard.
 *
 * The single-code-point predicates expect `input` to hold exactly one code point. They return
 * `false` for anything else (the empty string, or a string with more than one code point)
 * instead of reporting a match because some character inside the string is in range.
 *
 * Methods call their siblings through `CodePoints` rather than `this`, so they keep working when
 * detached from the class (for example when passed as a callback).
 */
export class CodePoints {
  // Every pattern is anchored with ^ and $ so a match means "the whole input is in range".
  // eslint-disable-next-line no-control-regex
  private static readonly asciiCodePointPattern = /^[\u0000-\u007F]$/;
  // eslint-disable-next-line no-control-regex
  private static readonly asciiStringPattern = /^[\u0000-\u007F]*$/;
  private static readonly asciiUpperAlphaPattern = /^[\u0041-\u005A]$/;
  private static readonly asciiLowerAlphaPattern = /^[\u0061-\u007A]$/;
  private static readonly asciiDigitPattern = /^[\u0030-\u0039]$/;

  /**
   * An ASCII code point is a code point in the range U+0000 NULL to U+007F DELETE, inclusive.
   * https://infra.spec.whatwg.org/#ascii-code-point
   *
   * @param input A string expected to hold a single code point.
   * @returns `true` only when `input` is exactly one code point in that range.
   */
  public static ASCIICodePoint(input: string): boolean {
    return CodePoints.asciiCodePointPattern.test(input);
  }

  /**
   * An ASCII alphanumeric is an ASCII digit or ASCII alpha.
   * https://infra.spec.whatwg.org/#ascii-alphanumeric
   *
   * @param input A string expected to hold a single code point.
   * @returns `true` only when `input` is exactly one ASCII letter or digit.
   */
  public static ASCIIAlphanumeric(input: string): boolean {
    return CodePoints.ASCIIAlpha(input) || CodePoints.ASCIIDigit(input);
  }

  /**
   * An ASCII alpha is an ASCII upper alpha or ASCII lower alpha.
   * https://infra.spec.whatwg.org/#ascii-alpha
   *
   * @param input A string expected to hold a single code point.
   * @returns `true` only when `input` is exactly one ASCII letter.
   */
  public static ASCIIAlpha(input: string): boolean {
    return CodePoints.ASCIIUpperAlpha(input) || CodePoints.ASCIILowerAlpha(input);
  }

  /**
   * Misspelled name of {@link CodePoints.ASCIIAlpha}, kept so existing call sites keep compiling.
   *
   * @deprecated Use `ASCIIAlpha` instead.
   * @param input A string expected to hold a single code point.
   * @returns The result of `ASCIIAlpha(input)`.
   */
  public static ASCIIiAlpha(input: string): boolean {
    return CodePoints.ASCIIAlpha(input);
  }

  /**
   * An ASCII upper alpha is a code point in the range U+0041 (A) to U+005A (Z), inclusive.
   * https://infra.spec.whatwg.org/#ascii-upper-alpha
   *
   * @param input A string expected to hold a single code point.
   * @returns `true` only when `input` is exactly one code point in that range.
   */
  public static ASCIIUpperAlpha(input: string): boolean {
    return CodePoints.asciiUpperAlphaPattern.test(input);
  }

  /**
   * An ASCII lower alpha is a code point in the range U+0061 (a) to U+007A (z), inclusive.
   * https://infra.spec.whatwg.org/#ascii-lower-alpha
   *
   * @param input A string expected to hold a single code point.
   * @returns `true` only when `input` is exactly one code point in that range.
   */
  public static ASCIILowerAlpha(input: string): boolean {
    return CodePoints.asciiLowerAlphaPattern.test(input);
  }

  /**
   * An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9), inclusive.
   * https://infra.spec.whatwg.org/#ascii-digit
   *
   * @param input A string expected to hold a single code point.
   * @returns `true` only when `input` is exactly one code point in that range.
   */
  public static ASCIIDigit(input: string): boolean {
    return CodePoints.asciiDigitPattern.test(input);
  }

  /**
   * An ASCII string is a string whose code points are all ASCII code points.
   * https://infra.spec.whatwg.org/#ascii-string
   *
   * @param input The string to inspect; may be empty.
   * @returns `true` when every code point of `input` is ASCII. The empty string is vacuously an
   *   ASCII string.
   */
  public static ASCIIString(input: string): boolean {
    // One anchored match over the whole string. Surrogate halves of non-BMP code points are
    // above U+007F, so they are rejected just as the code points they encode would be.
    return CodePoints.asciiStringPattern.test(input);
  }
}