HTML+Util: Move ASCII code point functions into separate class

2022-02-07 13:30:50 +01:00 · 2022-02-07 13:30:50 +01:00 · 25145df31d
commit 25145df31d
parent b128e00ad4
2 changed files with 55 additions and 29 deletions
--- a/src/html/tokenizer.ts
+++ b/src/html/tokenizer.ts
@ -1,4 +1,5 @@
 import { TODO, VERIFY, VERIFY_NOT_REACHED } from '../util/assertions.js';
+import { CodePoints } from '../util/codePoints.js';
 import { Constructor } from '../util/guards.js';
 import { ParseError } from './errors.js';
 import { Attribute } from './tokenizer/attribute.js';
@ -124,7 +125,7 @@ export class Tokenizer {
                        break;
                    default: {
                        // ASCII alpha
-                        if (this.asciiAlpha(this.currentInputCharacter)) {
+                        if (CodePoints.ASCIIiAlpha(this.currentInputCharacter)) {
                            // Create a new start tag token, set its tag name to the empty string.
                            this.create(StartTagToken.createEmpty().startingAt(this.currentPosition));

@ -175,7 +176,7 @@ export class Tokenizer {
                        break;
                    default: {
                        // ASCII alpha
-                        if (this.asciiAlpha(this.currentInputCharacter)) {
+                        if (CodePoints.ASCIIiAlpha(this.currentInputCharacter)) {
                            // Create a new end tag token, set its tag name to the empty string.
                            this.create(EndTagToken.createEmpty().startingAt(this.currentPosition));

@ -341,7 +342,7 @@ export class Tokenizer {
                        break;
                    default: {
                        // ASCII upper alpha
-                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
+                        if (CodePoints.ASCIIUpperAlpha(this.currentInputCharacter)) {
                            // Create a new DOCTYPE token. Set the token's name to the lowercase version of the current
                            // input character (add 0x0020 to the character's code point).
                            this.create(DOCTYPEToken.createWithName(this.currentInputCharacter.toLowerCase()).startingAt(this.currentPosition));
@ -409,7 +410,7 @@ export class Tokenizer {
                        break;
                    default: {
                        // ASCII upper alpha
-                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
+                        if (CodePoints.ASCIIUpperAlpha(this.currentInputCharacter)) {
                            // Append the lowercase version of the current input character (add 0x0020 to the character's
                            // code point) to the current DOCTYPE token's name.
                            this.currentOfType(DOCTYPEToken).appendToName(this.currentInputCharacter.toLowerCase());
@ -465,7 +466,7 @@ export class Tokenizer {
                        this.emit(EndOfFileToken.create()); break;
                    default: {
                        // ASCII upper alpha
-                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
+                        if (CodePoints.ASCIIUpperAlpha(this.currentInputCharacter)) {
                            // Append the lowercase version of the current input character (add 0x0020 to the character's
                            // code point) to the current tag token's tag name.
                            this.currentOfEitherType(StartTagToken, EndTagToken).appendToName(this.currentInputCharacter.toLowerCase());
@ -579,7 +580,7 @@ export class Tokenizer {
                        break;
                    default: {
                        // ASCII upper alpha
-                        if (this.asciiUpperAlpha(this.currentInputCharacter)) {
+                        if (CodePoints.ASCIIUpperAlpha(this.currentInputCharacter)) {
                            // Append the lowercase version of the current input character (add 0x0020 to the character's
                            // code point) to the current attribute's name.
                            this.currentOfEitherType(StartTagToken, EndTagToken).attributes.current.appendToName(this.currentInputCharacter.toLowerCase());
@ -1086,7 +1087,7 @@ export class Tokenizer {
                        break;
                    default: {
                        // ASCII alphanumeric
-                        if (this.asciiAlphanumeric(this.currentInputCharacter)) {
+                        if (CodePoints.ASCIIAlphanumeric(this.currentInputCharacter)) {
                            // Reconsume in the named character reference state.
                            this.reconsumeIn(State.NamedCharacterReference);
                            break;
@ -1123,7 +1124,7 @@ export class Tokenizer {
                        // If the character reference was consumed as part of an attribute, and the last character matched
                        // is not a U+003B SEMICOLON character (;), and the next input character is either a U+003D
                        // EQUALS SIGN character (=) or an ASCII alphanumeric, then, for historical reasons,
-                        if (this.consumedAsPartOfAnAttribute() && entry[entry.length - 1] !== '\u003B' && (this.next() === '\u003D' || this.asciiAlphanumeric(this.next() ?? ''))) {
+                        if (this.consumedAsPartOfAnAttribute() && entry[entry.length - 1] !== '\u003B' && (this.next() === '\u003D' || CodePoints.ASCIIAlphanumeric(this.next() ?? ''))) {
                            // flush code points consumed as a character reference and
                            this.flushCodePointsConsumedAsCharacterReference();

@ -1179,7 +1180,7 @@ export class Tokenizer {
                        this.reconsumeIn(this.returnState); break;
                    default: {
                        // ASCII alphanumeric
-                        if (this.asciiAlphanumeric(this.currentInputCharacter)) {
+                        if (CodePoints.ASCIIAlphanumeric(this.currentInputCharacter)) {
                            // If the character reference was consumed as part of an attribute,
                            if (this.consumedAsPartOfAnAttribute()) {
                                // then append the current input character to the current attribute's value.
@ -1221,26 +1222,6 @@ export class Tokenizer {
        return this.returnState === State.AttributeValueDouble || this.returnState === State.AttributeValueSingle || this.returnState === State.AttributeValueUnquoted;
    }

-    private asciiAlphanumeric(input: string): boolean {
-        return this.asciiAlpha(input) || this.asciiDigit(input);
-    }
-
-    private asciiAlpha(input: string): boolean {
-        return this.asciiUpperAlpha(input) || this.asciiLowerAlpha(input);
-    }
-
-    private asciiUpperAlpha(input: string): boolean {
-        return /[\u0041-\u005A]/.test(input);
-    }
-
-    private asciiLowerAlpha(input: string): boolean {
-        return /[\u0061-\u007A]/.test(input);
-    }
-
-    private asciiDigit(input: string): boolean {
-        return /[\u0030-\u0030]/.test(input);
-    }
-
    private reconsumeIn(state: State): void {
        this.pointer--;

--- a/src/util/codePoints.ts
+++ b/src/util/codePoints.ts
@ -0,0 +1,45 @@
+// 4.5. Code points https://infra.spec.whatwg.org/#code-points
+export class CodePoints {
+    // https://infra.spec.whatwg.org/#ascii-code-point
+    public static ASCIICodePoint(input: string): boolean {
+        // An ASCII code point is a code point in the range U+0000 NULL to U+007F DELETE, inclusive.
+        // eslint-disable-next-line no-control-regex
+        return /[\u0000-\u007F]/.test(input);
+    }
+
+    // https://infra.spec.whatwg.org/#ascii-alphanumeric
+    public static ASCIIAlphanumeric(input: string): boolean {
+        // An ASCII alphanumeric is an ASCII digit or ASCII alpha.
+        return this.ASCIIiAlpha(input) || this.ASCIIDigit(input);
+    }
+
+    // https://infra.spec.whatwg.org/#ascii-alpha
+    public static ASCIIiAlpha(input: string): boolean {
+        // An ASCII alpha is an ASCII upper alpha or ASCII lower alpha.
+        return this.ASCIIUpperAlpha(input) || this.ASCIILowerAlpha(input);
+    }
+
+    // https://infra.spec.whatwg.org/#ascii-upper-alpha
+    public static ASCIIUpperAlpha(input: string): boolean {
+        // An ASCII upper alpha is a code point in the range U+0041 (A) to U+005A (Z), inclusive.
+        return /[\u0041-\u005A]/.test(input);
+    }
+
+    // https://infra.spec.whatwg.org/#ascii-lower-alpha
+    public static ASCIILowerAlpha(input: string): boolean {
+        // An ASCII lower alpha is a code point in the range U+0061 (a) to U+007A (z), inclusive.
+        return /[\u0061-\u007A]/.test(input);
+    }
+
+    // https://infra.spec.whatwg.org/#ascii-digit
+    public static ASCIIDigit(input: string): boolean {
+        // An ASCII digit is a code point in the range U+0030 (0) to U+0039 (9), inclusive.
+        return /[\u0030-\u0039]/.test(input);
+    }
+
+    // https://infra.spec.whatwg.org/#ascii-string
+    public static ASCIIString(input: string): boolean {
+        // An ASCII string is a string whose code points are all ASCII code points.
+        return input.split('').every(codePoint => this.ASCIICodePoint(codePoint));
+    }
+}