Skip to content

Commit

Permalink
improve performance
Browse files Browse the repository at this point in the history
  • Loading branch information
ota-meshi committed Nov 8, 2023
1 parent 475b9c5 commit 095d8cb
Show file tree
Hide file tree
Showing 3 changed files with 320 additions and 349 deletions.
36 changes: 18 additions & 18 deletions src/tokenizer/code-point-iterator.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { NULL, EOF, LINE_FEED, CARRIAGE_RETURN } from "./code-point";
import { CodePoint } from "./code-point";

type Position = {
offset: number;
Expand All @@ -9,7 +9,7 @@ type Position = {
export class CodePointIterator {
public readonly text: string;

private lastCodePoint: number = NULL;
private lastCodePoint: number = CodePoint.NULL;

public start: Position = {
offset: -1,
Expand All @@ -31,31 +31,31 @@ export class CodePointIterator {
}

public next(): number {
if (this.lastCodePoint === EOF) {
return EOF;
if (this.lastCodePoint === CodePoint.EOF) {
return CodePoint.EOF;
}

this.start.offset = this.end.offset;
this.start.line = this.end.line;
this.start.column = this.end.column;

const cp = this.text.codePointAt(this.start.offset) ?? EOF;
if (cp === EOF) {
const cp = this.text.codePointAt(this.start.offset) ?? CodePoint.EOF;
if (cp === CodePoint.EOF) {
this.end = this.start;
return (this.lastCodePoint = cp);
}
const shift = cp >= 0x10000 ? 2 : 1;
this.end.offset = this.start.offset + shift;
if (cp === LINE_FEED) {
if (cp === CodePoint.LINE_FEED) {
this.end.line = this.start.line + 1;
this.end.column = 0;
} else if (cp === CARRIAGE_RETURN) {
if (this.text.codePointAt(this.end.offset) === LINE_FEED) {
} else if (cp === CodePoint.CARRIAGE_RETURN) {
if (this.text.codePointAt(this.end.offset) === CodePoint.LINE_FEED) {
this.end.offset++;
this.end.line = this.start.line + 1;
this.end.column = 0;
}
return (this.lastCodePoint = LINE_FEED);
return (this.lastCodePoint = CodePoint.LINE_FEED);
} else {
this.end.column = this.start.column + shift;
}
Expand All @@ -66,15 +66,15 @@ export class CodePointIterator {
public *iterateSubCodePoints(): IterableIterator<number> {
let index = this.end.offset;
while (true) {
let cp = this.text.codePointAt(index) ?? EOF;
if (cp === CARRIAGE_RETURN) {
if (this.text.codePointAt(index) === LINE_FEED) {
cp = this.text.codePointAt(++index) ?? EOF;
let cp = this.text.codePointAt(index) ?? CodePoint.EOF;
if (cp === CodePoint.CARRIAGE_RETURN) {
if (this.text.codePointAt(index) === CodePoint.LINE_FEED) {
cp = this.text.codePointAt(++index) ?? CodePoint.EOF;
} else {
cp = LINE_FEED;
cp = CodePoint.LINE_FEED;
}
}
if (cp === EOF) {
if (cp === CodePoint.EOF) {
return;
}
yield cp;
Expand All @@ -92,12 +92,12 @@ export class CodePointIterator {
return {
next() {
if (end) {
return EOF;
return CodePoint.EOF;
}
const r = sub.next();
if (r.done) {
end = true;
return EOF;
return CodePoint.EOF;
}
count++;
return r.value;
Expand Down
197 changes: 100 additions & 97 deletions src/tokenizer/code-point.ts
Original file line number Diff line number Diff line change
@@ -1,125 +1,128 @@
// Module-level code point constants, one export per named value.
// Every name/value pair below also appears as a member of the `CodePoint`
// const enum declared later in this file.
// EOF (-1) is a sentinel and not a real code point; the CP_xxxx names simply
// spell out their own hexadecimal value.
export const EOF = -1;
export const NULL = 0x00;
export const SOH = 0x01;
export const BACKSPACE = 0x08;
export const TABULATION = 0x09;
export const LINE_FEED = 0x0a;
export const FORM_FEED = 0x0c;
export const CARRIAGE_RETURN = 0x0d;
export const ESCAPE = 0x1b;
export const SO = 0x0e;
export const US = 0x1f;
export const SPACE = 0x20;
export const QUOTATION_MARK = 0x22;
export const HASH = 0x23;
export const SINGLE_QUOTE = 0x27;
export const PLUS_SIGN = 0x2b;
export const COMMA = 0x2c;
export const DASH = 0x2d;
export const DOT = 0x2e;
export const DIGIT_0 = 0x30;
export const DIGIT_1 = 0x31;
export const DIGIT_2 = 0x32;
export const DIGIT_3 = 0x33;
export const DIGIT_7 = 0x37;
export const DIGIT_9 = 0x39;
export const COLON = 0x3a;
export const EQUALS_SIGN = 0x3d;
export const LATIN_CAPITAL_A = 0x41;
export const LATIN_CAPITAL_E = 0x45;
export const LATIN_CAPITAL_F = 0x46;
export const LATIN_CAPITAL_T = 0x54;
export const LATIN_CAPITAL_U = 0x55;
export const LATIN_CAPITAL_Z = 0x5a;
export const LEFT_BRACKET = 0x5b; // [
export const BACKSLASH = 0x5c;
export const RIGHT_BRACKET = 0x5d; // ]
export const UNDERSCORE = 0x5f;
export const LATIN_SMALL_A = 0x61;
export const LATIN_SMALL_B = 0x62;
export const LATIN_SMALL_E = 0x65;
export const LATIN_SMALL_F = 0x66;
export const LATIN_SMALL_I = 0x69;
export const LATIN_SMALL_L = 0x6c;
export const LATIN_SMALL_N = 0x6e;
export const LATIN_SMALL_O = 0x6f;
export const LATIN_SMALL_R = 0x72;
export const LATIN_SMALL_S = 0x73;
export const LATIN_SMALL_T = 0x74;
export const LATIN_SMALL_U = 0x75;
export const LATIN_SMALL_X = 0x78;
export const LATIN_SMALL_Z = 0x7a;
export const LEFT_BRACE = 0x7b; // {
export const RIGHT_BRACE = 0x7d; // }
export const DELETE = 0x7f;
export const PAD = 0x80;
export const SUPERSCRIPT_TWO = 0xb2;
export const SUPERSCRIPT_THREE = 0xb3;
export const SUPERSCRIPT_ONE = 0xb9;
export const VULGAR_FRACTION_ONE_QUARTER = 0xbc;
export const VULGAR_FRACTION_THREE_QUARTERS = 0xbe;
export const LATIN_CAPITAL_LETTER_A_WITH_GRAVE = 0xc0;
export const LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS = 0xd6;
export const LATIN_CAPITAL_LETTER_O_WITH_STROKE = 0xd8;
export const LATIN_SMALL_LETTER_O_WITH_DIAERESIS = 0xf6;
export const LATIN_SMALL_LETTER_O_WITH_STROKE = 0xf8;
export const GREEK_SMALL_REVERSED_DOTTED_LUNATE_SIGMA_SYMBOL = 0x37b;
export const GREEK_CAPITAL_LETTER_YOT = 0x37f;
export const CP_1FFF = 0x1fff;
export const ZERO_WIDTH_NON_JOINER = 0x200c;
export const ZERO_WIDTH_JOINER = 0x200d;
export const UNDERTIE = 0x203f;
export const CHARACTER_TIE = 0x2040;
export const SUPERSCRIPT_ZERO = 0x2070;
export const CP_218F = 0x218f;
export const CIRCLED_DIGIT_ONE = 0x2460;
export const NEGATIVE_CIRCLED_DIGIT_ZERO = 0x24ff;
export const GLAGOLITIC_CAPITAL_LETTER_AZU = 0x2c00;
export const CP_2FEF = 0x2fef;
export const IDEOGRAPHIC_COMMA = 0x3001;
export const CP_D7FF = 0xd7ff;
export const CP_E000 = 0xe000;
export const CJK_COMPATIBILITY_IDEOGRAPH_F900 = 0xf900;
export const ARABIC_LIGATURE_SALAAMUHU_ALAYNAA = 0xfdcf;
export const ARABIC_LIGATURE_SALLA_USED_AS_KORANIC_STOP_SIGN_ISOLATED_FORM = 0xfdf0;
export const REPLACEMENT_CHARACTER = 0xfffd;
export const LINEAR_B_SYLLABLE_B008_A = 0x10000;
export const CP_EFFFF = 0xeffff;
export const CP_10FFFF = 0x10ffff;
/**
* Named code point values used by the tokenizer, plus the `EOF` sentinel
* (-1), which is not a real code point.
*
* Declared as a `const enum`: the TypeScript compiler normally replaces each
* member reference with its numeric literal, so no enum object is emitted
* (this depends on compiler settings such as `isolatedModules`).
*
* Members are grouped roughly by value but are not in strict numeric order
* (for example `ESCAPE` is listed before `SO`). The `CP_xxxx` members are
* named after their own hexadecimal value.
*/
// eslint-disable-next-line no-shadow -- bug?
export const enum CodePoint {
EOF = -1,
NULL = 0x00,
SOH = 0x01,
BACKSPACE = 0x08,
TABULATION = 0x09,
LINE_FEED = 0x0a,
FORM_FEED = 0x0c,
CARRIAGE_RETURN = 0x0d,
ESCAPE = 0x1b,
SO = 0x0e,
US = 0x1f,
SPACE = 0x20,
QUOTATION_MARK = 0x22,
HASH = 0x23,
SINGLE_QUOTE = 0x27,
PLUS_SIGN = 0x2b,
COMMA = 0x2c,
DASH = 0x2d,
DOT = 0x2e,
DIGIT_0 = 0x30,
DIGIT_1 = 0x31,
DIGIT_2 = 0x32,
DIGIT_3 = 0x33,
DIGIT_7 = 0x37,
DIGIT_9 = 0x39,
COLON = 0x3a,
EQUALS_SIGN = 0x3d,
LATIN_CAPITAL_A = 0x41,
LATIN_CAPITAL_E = 0x45,
LATIN_CAPITAL_F = 0x46,
LATIN_CAPITAL_T = 0x54,
LATIN_CAPITAL_U = 0x55,
LATIN_CAPITAL_Z = 0x5a,
LEFT_BRACKET = 0x5b, // [
BACKSLASH = 0x5c,
RIGHT_BRACKET = 0x5d, // ]
UNDERSCORE = 0x5f,
LATIN_SMALL_A = 0x61,
LATIN_SMALL_B = 0x62,
LATIN_SMALL_E = 0x65,
LATIN_SMALL_F = 0x66,
LATIN_SMALL_I = 0x69,
LATIN_SMALL_L = 0x6c,
LATIN_SMALL_N = 0x6e,
LATIN_SMALL_O = 0x6f,
LATIN_SMALL_R = 0x72,
LATIN_SMALL_S = 0x73,
LATIN_SMALL_T = 0x74,
LATIN_SMALL_U = 0x75,
LATIN_SMALL_X = 0x78,
LATIN_SMALL_Z = 0x7a,
LEFT_BRACE = 0x7b, // {
RIGHT_BRACE = 0x7d, // }
DELETE = 0x7f,
PAD = 0x80,
SUPERSCRIPT_TWO = 0xb2,
SUPERSCRIPT_THREE = 0xb3,
SUPERSCRIPT_ONE = 0xb9,
VULGAR_FRACTION_ONE_QUARTER = 0xbc,
VULGAR_FRACTION_THREE_QUARTERS = 0xbe,
LATIN_CAPITAL_LETTER_A_WITH_GRAVE = 0xc0,
LATIN_CAPITAL_LETTER_O_WITH_DIAERESIS = 0xd6,
LATIN_CAPITAL_LETTER_O_WITH_STROKE = 0xd8,
LATIN_SMALL_LETTER_O_WITH_DIAERESIS = 0xf6,
LATIN_SMALL_LETTER_O_WITH_STROKE = 0xf8,
GREEK_SMALL_REVERSED_DOTTED_LUNATE_SIGMA_SYMBOL = 0x37b,
GREEK_CAPITAL_LETTER_YOT = 0x37f,
CP_1FFF = 0x1fff,
ZERO_WIDTH_NON_JOINER = 0x200c,
ZERO_WIDTH_JOINER = 0x200d,
UNDERTIE = 0x203f,
CHARACTER_TIE = 0x2040,
SUPERSCRIPT_ZERO = 0x2070,
CP_218F = 0x218f,
CIRCLED_DIGIT_ONE = 0x2460,
NEGATIVE_CIRCLED_DIGIT_ZERO = 0x24ff,
GLAGOLITIC_CAPITAL_LETTER_AZU = 0x2c00,
CP_2FEF = 0x2fef,
IDEOGRAPHIC_COMMA = 0x3001,
CP_D7FF = 0xd7ff,
CP_E000 = 0xe000,
CJK_COMPATIBILITY_IDEOGRAPH_F900 = 0xf900,
ARABIC_LIGATURE_SALAAMUHU_ALAYNAA = 0xfdcf,
ARABIC_LIGATURE_SALLA_USED_AS_KORANIC_STOP_SIGN_ISOLATED_FORM = 0xfdf0,
REPLACEMENT_CHARACTER = 0xfffd,
LINEAR_B_SYLLABLE_B008_A = 0x10000,
CP_EFFFF = 0xeffff,
CP_10FFFF = 0x10ffff,
}

/**
* Check whether the code point is a control character, i.e. lies in the
* inclusive range NULL (0x00) through US (0x1f).
*
* The EOF sentinel (-1) is below the range and therefore yields `false`.
*
* @param cp - The code point to test.
* @returns `true` when `cp` is between 0x00 and 0x1f inclusive.
*/
export function isControl(cp: number): boolean {
return cp >= NULL && cp <= US;
return cp >= CodePoint.NULL && cp <= CodePoint.US;
}

/**
* Check whether the code point is a whitespace character.
*
* Only TABULATION (0x09) and SPACE (0x20) are accepted; line terminators
* are not matched by this function.
*
* @param cp - The code point to test.
* @returns `true` when `cp` is a tab or a space.
*/
export function isWhitespace(cp: number): boolean {
return cp === TABULATION || cp === SPACE;
return cp === CodePoint.TABULATION || cp === CodePoint.SPACE;
}

/**
* Check whether the code point is an end-of-line character.
*
* Matches LINE_FEED (0x0a) and CARRIAGE_RETURN (0x0d) only.
*
* @param cp - The code point to test.
* @returns `true` when `cp` is a line feed or a carriage return.
*/
export function isEOL(cp: number): boolean {
return cp === LINE_FEED || cp === CARRIAGE_RETURN;
return cp === CodePoint.LINE_FEED || cp === CodePoint.CARRIAGE_RETURN;
}

/**
* Check whether the code point is an uppercase letter character, i.e. lies
* in the inclusive range LATIN_CAPITAL_A (0x41) through LATIN_CAPITAL_Z
* (0x5a). Letters outside that range are not matched.
*
* @param cp - The code point to test.
* @returns `true` when `cp` is between 0x41 and 0x5a inclusive.
*/
function isUpperLetter(cp: number): boolean {
return cp >= LATIN_CAPITAL_A && cp <= LATIN_CAPITAL_Z;
return cp >= CodePoint.LATIN_CAPITAL_A && cp <= CodePoint.LATIN_CAPITAL_Z;
}

/**
* Check whether the code point is a lowercase letter character, i.e. lies
* in the inclusive range LATIN_SMALL_A (0x61) through LATIN_SMALL_Z (0x7a).
* Letters outside that range are not matched.
*
* @param cp - The code point to test.
* @returns `true` when `cp` is between 0x61 and 0x7a inclusive.
*/
function isLowerLetter(cp: number): boolean {
return cp >= LATIN_SMALL_A && cp <= LATIN_SMALL_Z;
return cp >= CodePoint.LATIN_SMALL_A && cp <= CodePoint.LATIN_SMALL_Z;
}

/**
Expand All @@ -133,7 +136,7 @@ export function isLetter(cp: number): boolean {
* Check whether the code point is a digit character.
*/
export function isDigit(cp: number): boolean {
// Inclusive range DIGIT_0 (0x30) through DIGIT_9 (0x39).
return cp >= DIGIT_0 && cp <= DIGIT_9;
return cp >= CodePoint.DIGIT_0 && cp <= CodePoint.DIGIT_9;
}

/**
Expand All @@ -142,15 +145,15 @@ export function isDigit(cp: number): boolean {
export function isHexDig(cp: number): boolean {
// Accepts a decimal digit (via isDigit), or a letter in the inclusive ranges
// LATIN_SMALL_A..LATIN_SMALL_F (0x61-0x66) or
// LATIN_CAPITAL_A..LATIN_CAPITAL_F (0x41-0x46).
return (
isDigit(cp) ||
(cp >= LATIN_SMALL_A && cp <= LATIN_SMALL_F) ||
(cp >= LATIN_CAPITAL_A && cp <= LATIN_CAPITAL_F)
(cp >= CodePoint.LATIN_SMALL_A && cp <= CodePoint.LATIN_SMALL_F) ||
(cp >= CodePoint.LATIN_CAPITAL_A && cp <= CodePoint.LATIN_CAPITAL_F)
);
}
/**
* Check whether the code point is an octal digit character, i.e. lies in
* the inclusive range DIGIT_0 (0x30) through DIGIT_7 (0x37).
*
* @param cp - The code point to test.
* @returns `true` when `cp` is between 0x30 and 0x37 inclusive.
*/
export function isOctalDig(cp: number): boolean {
return cp >= DIGIT_0 && cp <= DIGIT_7;
return cp >= CodePoint.DIGIT_0 && cp <= CodePoint.DIGIT_7;
}

/**
Expand Down
Loading

0 comments on commit 095d8cb

Please sign in to comment.