HTML+Util: Move ASCII code point functions into separate class

This commit is contained in:
networkException 2022-02-07 13:30:50 +01:00
parent b128e00ad4
commit 25145df31d
Signed by: networkException
GPG key ID: E3877443AE684391
2 changed files with 55 additions and 29 deletions

View file

@ -1,4 +1,5 @@
import { TODO, VERIFY, VERIFY_NOT_REACHED } from '../util/assertions.js';
import { CodePoints } from '../util/codePoints.js';
import { Constructor } from '../util/guards.js';
import { ParseError } from './errors.js';
import { Attribute } from './tokenizer/attribute.js';
@ -124,7 +125,7 @@ export class Tokenizer {
break;
default: {
// ASCII alpha
if (this.asciiAlpha(this.currentInputCharacter)) {
if (CodePoints.ASCIIiAlpha(this.currentInputCharacter)) {
// Create a new start tag token, set its tag name to the empty string.
this.create(StartTagToken.createEmpty().startingAt(this.currentPosition));
@ -175,7 +176,7 @@ export class Tokenizer {
break;
default: {
// ASCII alpha
if (this.asciiAlpha(this.currentInputCharacter)) {
if (CodePoints.ASCIIiAlpha(this.currentInputCharacter)) {
// Create a new end tag token, set its tag name to the empty string.
this.create(EndTagToken.createEmpty().startingAt(this.currentPosition));
@ -341,7 +342,7 @@ export class Tokenizer {
break;
default: {
// ASCII upper alpha
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
if (CodePoints.ASCIIUpperAlpha(this.currentInputCharacter)) {
// Create a new DOCTYPE token. Set the token's name to the lowercase version of the current
// input character (add 0x0020 to the character's code point).
this.create(DOCTYPEToken.createWithName(this.currentInputCharacter.toLowerCase()).startingAt(this.currentPosition));
@ -409,7 +410,7 @@ export class Tokenizer {
break;
default: {
// ASCII upper alpha
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
if (CodePoints.ASCIIUpperAlpha(this.currentInputCharacter)) {
// Append the lowercase version of the current input character (add 0x0020 to the character's
// code point) to the current DOCTYPE token's name.
this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter.toLowerCase());
@ -465,7 +466,7 @@ export class Tokenizer {
this.emit(EndOfFileToken.create()); break;
default: {
// ASCII upper alpha
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
if (CodePoints.ASCIIUpperAlpha(this.currentInputCharacter)) {
// Append the lowercase version of the current input character (add 0x0020 to the character's
// code point) to the current tag token's tag name.
this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter.toLowerCase());
@ -579,7 +580,7 @@ export class Tokenizer {
break;
default: {
// ASCII upper alpha
if (this.asciiUpperAlpha(this.currentInputCharacter)) {
if (CodePoints.ASCIIUpperAlpha(this.currentInputCharacter)) {
// Append the lowercase version of the current input character (add 0x0020 to the character's
// code point) to the current attribute's name.
this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter.toLowerCase());
@ -1086,7 +1087,7 @@ export class Tokenizer {
break;
default: {
// ASCII alphanumeric
if (this.asciiAlphanumeric(this.currentInputCharacter)) {
if (CodePoints.ASCIIAlphanumeric(this.currentInputCharacter)) {
// Reconsume in the named character reference state.
this.reconsumeIn(State.NamedCharacterReference);
break;
@ -1123,7 +1124,7 @@ export class Tokenizer {
// If the character reference was consumed as part of an attribute, and the last character matched
// is not a U+003B SEMICOLON character (;), and the next input character is either a U+003D
// EQUALS SIGN character (=) or an ASCII alphanumeric, then, for historical reasons,
if (this.consumedAsPartOfAnAttribute() && entry[entry.length - 1] !== '\u003B' && (this.next() === '\u003D' || this.asciiAlphanumeric(this.next() ?? ''))) {
if (this.consumedAsPartOfAnAttribute() && entry[entry.length - 1] !== '\u003B' && (this.next() === '\u003D' || CodePoints.ASCIIAlphanumeric(this.next() ?? ''))) {
// flush code points consumed as a character reference and
this.flushCodePointsConsumedAsCharacterReference();
@ -1179,7 +1180,7 @@ export class Tokenizer {
this.reconsumeIn(this.returnState); break;
default: {
// ASCII alphanumeric
if (this.asciiAlphanumeric(this.currentInputCharacter)) {
if (CodePoints.ASCIIAlphanumeric(this.currentInputCharacter)) {
// If the character reference was consumed as part of an attribute,
if (this.consumedAsPartOfAnAttribute()) {
// then append the current input character to the current attribute's value.
@ -1221,26 +1222,6 @@ export class Tokenizer {
return this.returnState === State.AttributeValueDouble || this.returnState === State.AttributeValueSingle || this.returnState === State.AttributeValueUnquoted;
}
private asciiAlphanumeric(input: string): boolean {
return this.asciiAlpha(input) || this.asciiDigit(input);
}
private asciiAlpha(input: string): boolean {
return this.asciiUpperAlpha(input) || this.asciiLowerAlpha(input);
}
private asciiUpperAlpha(input: string): boolean {
return /[\u0041-\u005A]/.test(input);
}
private asciiLowerAlpha(input: string): boolean {
return /[\u0061-\u007A]/.test(input);
}
private asciiDigit(input: string): boolean {
return /[\u0030-\u0030]/.test(input);
}
private reconsumeIn(state: State): void {
this.pointer--;

45
src/util/codePoints.ts Normal file
View file

@ -0,0 +1,45 @@
// 4.5. Code points https://infra.spec.whatwg.org/#code-points
/**
 * Predicates for the ASCII code point classes defined by the WHATWG Infra Standard.
 *
 * Every single-code-point predicate expects `input` to hold exactly one code point and
 * returns `false` for the empty string or for any longer string. Sibling predicates are
 * referenced through the class name rather than `this`, so the static methods keep working
 * when passed around detached (e.g. `chars.every(CodePoints.ASCIIAlphanumeric)`).
 */
export class CodePoints {
    // https://infra.spec.whatwg.org/#ascii-code-point
    /**
     * @param input The code point to classify.
     * @returns `true` if `input` is exactly one code point in U+0000 NULL to U+007F DELETE, inclusive.
     */
    public static ASCIICodePoint(input: string): boolean {
        // An ASCII code point is a code point in the range U+0000 NULL to U+007F DELETE, inclusive.
        // Anchored so that a longer string merely containing an ASCII code point is rejected.
        // eslint-disable-next-line no-control-regex
        return /^[\u0000-\u007F]$/.test(input);
    }

    // https://infra.spec.whatwg.org/#ascii-alphanumeric
    /**
     * @param input The code point to classify.
     * @returns `true` if `input` is an ASCII digit or an ASCII alpha.
     */
    public static ASCIIAlphanumeric(input: string): boolean {
        // An ASCII alphanumeric is an ASCII digit or ASCII alpha.
        return CodePoints.ASCIIAlpha(input) || CodePoints.ASCIIDigit(input);
    }

    // https://infra.spec.whatwg.org/#ascii-alpha
    /**
     * @param input The code point to classify.
     * @returns `true` if `input` is an ASCII upper alpha or an ASCII lower alpha.
     */
    public static ASCIIAlpha(input: string): boolean {
        // An ASCII alpha is an ASCII upper alpha or ASCII lower alpha.
        return CodePoints.ASCIIUpperAlpha(input) || CodePoints.ASCIILowerAlpha(input);
    }

    /**
     * Misspelled name of {@link CodePoints.ASCIIAlpha}, retained so existing callers keep compiling.
     *
     * @deprecated Use {@link CodePoints.ASCIIAlpha} instead.
     * @param input The code point to classify.
     * @returns The result of `ASCIIAlpha(input)`.
     */
    public static ASCIIiAlpha(input: string): boolean {
        return CodePoints.ASCIIAlpha(input);
    }

    // https://infra.spec.whatwg.org/#ascii-upper-alpha
    /**
     * @param input The code point to classify.
     * @returns `true` if `input` is exactly one code point in U+0041 (A) to U+005A (Z), inclusive.
     */
    public static ASCIIUpperAlpha(input: string): boolean {
        // An ASCII upper alpha is a code point in the range U+0041 (A) to U+005A (Z), inclusive.
        return /^[\u0041-\u005A]$/.test(input);
    }

    // https://infra.spec.whatwg.org/#ascii-lower-alpha
    /**
     * @param input The code point to classify.
     * @returns `true` if `input` is exactly one code point in U+0061 (a) to U+007A (z), inclusive.
     */
    public static ASCIILowerAlpha(input: string): boolean {
        // An ASCII lower alpha is a code point in the range U+0061 (a) to U+007A (z), inclusive.
        return /^[\u0061-\u007A]$/.test(input);
    }

    // https://infra.spec.whatwg.org/#ascii-digit
    /**
     * @param input The code point to classify.
     * @returns `true` if `input` is exactly one code point in U+0030 (0) to U+0039 (9), inclusive.
     */
    public static ASCIIDigit(input: string): boolean {
        // An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9), inclusive.
        return /^[\u0030-\u0039]$/.test(input);
    }

    // https://infra.spec.whatwg.org/#ascii-string
    /**
     * @param input The string to check; may be empty.
     * @returns `true` if every code point of `input` is an ASCII code point. The empty string
     * has no code points and therefore yields `true`.
     */
    public static ASCIIString(input: string): boolean {
        // An ASCII string is a string whose code points are all ASCII code points.
        // A single anchored match over the whole string avoids allocating a per-character array.
        // eslint-disable-next-line no-control-regex
        return /^[\u0000-\u007F]*$/.test(input);
    }
}