From e1746f8ed5ec7aff5024913ffdda94ca1db9a64c Mon Sep 17 00:00:00 2001
From: Mark Sujew
Date: Wed, 30 Oct 2024 14:16:01 +0100
Subject: [PATCH] feat: optimize unoptimizable token types

---
 packages/chevrotain/src/scan/lexer.ts        |  73 +++++++++++---
 packages/chevrotain/src/scan/lexer_public.ts |  30 ++++--
 packages/chevrotain/test/scan/lexer_spec.ts  | 101 +++++++++++++++++++
 3 files changed, 178 insertions(+), 26 deletions(-)

diff --git a/packages/chevrotain/src/scan/lexer.ts b/packages/chevrotain/src/scan/lexer.ts
index 35bac099d..5d0035e42 100644
--- a/packages/chevrotain/src/scan/lexer.ts
+++ b/packages/chevrotain/src/scan/lexer.ts
@@ -66,6 +66,7 @@ export interface IAnalyzeResult {
   emptyGroups: { [groupName: string]: IToken[] };
   hasCustom: boolean;
   canBeOptimized: boolean;
+  unoptimizedPatterns: IPatternConfig[];
 }
 
 export let SUPPORT_STICKY =
@@ -306,6 +307,7 @@ export function analyzeTokenTypes(
   });
 
   let canBeOptimized = true;
+  let unoptimizedPatterns: IPatternConfig[] = [];
  let charCodeToPatternIdxToConfig: { [charCode: number]: IPatternConfig[] } =
     [];
 
@@ -317,7 +319,12 @@ export function analyzeTokenTypes(
       if (typeof currTokType.PATTERN === "string") {
         const charCode = currTokType.PATTERN.charCodeAt(0);
         const optimizedIdx = charCodeToOptimizedIndex(charCode);
-        addToMapOfArrays(result, optimizedIdx, patternIdxToConfig[idx]);
+        addToMapOfArrays(
+          result,
+          optimizedIdx,
+          patternIdxToConfig[idx],
+          unoptimizedPatterns,
+        );
       } else if (isArray(currTokType.START_CHARS_HINT)) {
         let lastOptimizedIdx: number;
         forEach(currTokType.START_CHARS_HINT, (charOrInt) => {
@@ -336,21 +343,31 @@ export function analyzeTokenTypes(
               result,
               currOptimizedIdx,
               patternIdxToConfig[idx],
+              unoptimizedPatterns,
             );
           }
         });
       } else if (isRegExp(currTokType.PATTERN)) {
         if (currTokType.PATTERN.unicode) {
-          canBeOptimized = false;
+          forEach(Object.keys(result), (code) => {
+            addToMapOfArrays(
+              result,
+              Number(code),
+              patternIdxToConfig[idx],
+              unoptimizedPatterns,
+            );
+          });
+          unoptimizedPatterns.push(patternIdxToConfig[idx]);
           if (options.ensureOptimizations) {
             PRINT_ERROR(
               `${failedOptimizationPrefixMsg}` +
                 `\tUnable to analyze < ${currTokType.PATTERN.toString()} > pattern.\n` +
                 "\tThe regexp unicode flag is not currently supported by the regexp-to-ast library.\n" +
-                "\tThis will disable the lexer's first char optimizations.\n" +
+                "\tThis reduces lexer performance.\n" +
                 "\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#UNICODE_OPTIMIZE",
             );
           }
+          canBeOptimized = false;
         } else {
           const optimizedCodes = getOptimizedStartCodesIndices(
             currTokType.PATTERN,
@@ -358,27 +375,49 @@ export function analyzeTokenTypes(
           );
           /* istanbul ignore if */
           // start code will only be empty given an empty regExp or failure of regexp-to-ast library
-          // the first should be a different validation and the second cannot be tested.
           if (isEmpty(optimizedCodes)) {
             // we cannot understand what codes may start possible matches
-            // The optimization correctness requires knowing start codes for ALL patterns.
-            // Not actually sure this is an error, no debug message
+            // instead, simply add the token to all known start characters
+            forEach(Object.keys(result), (code) => {
+              addToMapOfArrays(
+                result,
+                Number(code),
+                patternIdxToConfig[idx],
+                unoptimizedPatterns,
+              );
+            });
+            unoptimizedPatterns.push(patternIdxToConfig[idx]);
             canBeOptimized = false;
+          } else {
+            forEach(optimizedCodes, (code) => {
+              addToMapOfArrays(
+                result,
+                code,
+                patternIdxToConfig[idx],
+                unoptimizedPatterns,
+              );
+            });
           }
-          forEach(optimizedCodes, (code) => {
-            addToMapOfArrays(result, code, patternIdxToConfig[idx]);
-          });
         }
       } else {
         if (options.ensureOptimizations) {
           PRINT_ERROR(
             `${failedOptimizationPrefixMsg}` +
               `\tTokenType: <${currTokType.name}> is using a custom token pattern without providing <start_chars_hint> parameter.\n` +
-              "\tThis will disable the lexer's first char optimizations.\n" +
+              "\tThis reduces lexer performance.\n" +
               "\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#CUSTOM_OPTIMIZE",
           );
         }
         canBeOptimized = false;
+        forEach(Object.keys(result), (code) => {
+          addToMapOfArrays(
+            result,
+            Number(code),
+            patternIdxToConfig[idx],
+            unoptimizedPatterns,
+          );
+        });
+        unoptimizedPatterns.push(patternIdxToConfig[idx]);
       }
 
       return result;
@@ -389,11 +428,12 @@ export function analyzeTokenTypes(
     );
   }
 
   return {
-    emptyGroups: emptyGroups,
-    patternIdxToConfig: patternIdxToConfig,
-    charCodeToPatternIdxToConfig: charCodeToPatternIdxToConfig,
-    hasCustom: hasCustom,
-    canBeOptimized: canBeOptimized,
+    emptyGroups,
+    patternIdxToConfig,
+    charCodeToPatternIdxToConfig,
+    hasCustom,
+    canBeOptimized,
+    unoptimizedPatterns,
   };
 }
 
@@ -1125,9 +1165,10 @@ function addToMapOfArrays<T>(
   map: Record<number, T[]>,
   key: number,
   value: T,
+  initial: T[],
 ): void {
   if (map[key] === undefined) {
-    map[key] = [value];
+    map[key] = [...initial, value];
   } else {
     map[key].push(value);
   }
 }
diff --git a/packages/chevrotain/src/scan/lexer_public.ts b/packages/chevrotain/src/scan/lexer_public.ts
index 40521ab45..633303b7b 100644
--- a/packages/chevrotain/src/scan/lexer_public.ts
+++ b/packages/chevrotain/src/scan/lexer_public.ts
@@ -97,6 +97,7 @@ export class Lexer {
   public lexerDefinitionWarning: ILexerDefinitionError[] = [];
 
   protected patternIdxToConfig: Record<string, IPatternConfig[]> = {};
+  protected unoptimizedPatterns: Record<string, IPatternConfig[]> = {};
   protected charCodeToPatternIdxToConfig: {
     [modeName: string]: { [charCode: number]: IPatternConfig[] };
   } = {};
@@ -261,6 +262,9 @@ export class Lexer {
           this.charCodeToPatternIdxToConfig[currModName] =
             currAnalyzeResult.charCodeToPatternIdxToConfig;
 
+          this.unoptimizedPatterns[currModName] =
+            currAnalyzeResult.unoptimizedPatterns;
+
           this.emptyGroups = assign(
             {},
             this.emptyGroups,
@@ -344,6 +348,11 @@ export class Lexer {
     });
 
     this.TRACE_INIT("Failed Optimization Warnings", () => {
+      if (config.ensureOptimizations !== true) {
+        // Return early
+        return;
+      }
+
       const unOptimizedModes = reduce(
         this.canModeBeOptimized,
         (cannotBeOptimized, canBeOptimized, modeName) => {
@@ -355,7 +364,7 @@ export class Lexer {
         [] as string[],
       );
 
-      if (config.ensureOptimizations && !isEmpty(unOptimizedModes)) {
+      if (!isEmpty(unOptimizedModes)) {
         throw Error(
           `Lexer Modes: < ${unOptimizedModes.join(
             ", ",
@@ -438,14 +447,13 @@ export class Lexer {
     let currModePatternsLength = 0;
     let patternIdxToConfig: IPatternConfig[] = [];
+    let unoptimizedPatterns: IPatternConfig[] = [];
     let currCharCodeToPatternIdxToConfig: {
       [charCode: number]: IPatternConfig[];
     } = [];
 
     const modeStack: string[] = [];
 
-    const emptyArray: IPatternConfig[] = [];
-    Object.freeze(emptyArray);
 
     let getPossiblePatterns!: (charCode: number) => IPatternConfig[];
 
     function getPossiblePatternsSlow() {
@@ -457,7 +465,7 @@ export class Lexer {
       const possiblePatterns =
         currCharCodeToPatternIdxToConfig[optimizedCharIdx];
       if (possiblePatterns === undefined) {
-        return emptyArray;
+        return unoptimizedPatterns;
       } else {
         return possiblePatterns;
       }
@@ -492,10 +500,12 @@ export class Lexer {
         currCharCodeToPatternIdxToConfig =
           this.charCodeToPatternIdxToConfig[newMode];
         currModePatternsLength = patternIdxToConfig.length;
-        const modeCanBeOptimized =
-          this.canModeBeOptimized[newMode] && this.config.safeMode === false;
+        unoptimizedPatterns = this.unoptimizedPatterns[newMode];
 
-        if (currCharCodeToPatternIdxToConfig && modeCanBeOptimized) {
+        if (
+          currCharCodeToPatternIdxToConfig &&
+          this.config.safeMode === false
+        ) {
           getPossiblePatterns = getPossiblePatternsOptimized;
         } else {
           getPossiblePatterns = getPossiblePatternsSlow;
@@ -508,14 +518,14 @@ export class Lexer {
         currCharCodeToPatternIdxToConfig =
           this.charCodeToPatternIdxToConfig[newMode];
 
+        unoptimizedPatterns = this.unoptimizedPatterns[newMode];
+
+        patternIdxToConfig = this.patternIdxToConfig[newMode];
         currModePatternsLength = patternIdxToConfig.length;
 
-        const modeCanBeOptimized =
-          this.canModeBeOptimized[newMode] && this.config.safeMode === false;
-
-        if (currCharCodeToPatternIdxToConfig && modeCanBeOptimized) {
+        if (currCharCodeToPatternIdxToConfig && this.config.safeMode === false) {
           getPossiblePatterns = getPossiblePatternsOptimized;
         } else {
           getPossiblePatterns = getPossiblePatternsSlow;
diff --git a/packages/chevrotain/test/scan/lexer_spec.ts b/packages/chevrotain/test/scan/lexer_spec.ts
index e5d4b0b73..d4d1e5cf8 100644
--- a/packages/chevrotain/test/scan/lexer_spec.ts
+++ b/packages/chevrotain/test/scan/lexer_spec.ts
@@ -2236,6 +2236,7 @@ describe("debugging and messages and optimizations", () => {
     });
     expect((<any>alphaLexerSafeMode).charCodeToPatternIdxToConfig.defaultMode)
       .to.be.empty;
+    const safeModeResult = alphaLexerSafeMode.tokenize("a");
 
     // compare to safeMode disabled
     const alphaLexerNoSafeMode = new Lexer([Alpha], {
@@ -2245,6 +2246,106 @@ describe("debugging and messages and optimizations", () => {
     expect(
       (<any>alphaLexerNoSafeMode).charCodeToPatternIdxToConfig
         .defaultMode[97][0].tokenType,
     ).to.equal(Alpha);
+    const noSafeModeResult = alphaLexerNoSafeMode.tokenize("a");
+    expect(safeModeResult).to.deep.equal(noSafeModeResult);
+  });
+
+  it("won't optimize with safe mode enabled - multi mode lexer", () => {
+    const Alpha = createToken({
+      name: "A",
+      pattern: /a/,
+      push_mode: "b",
+    });
+    const Beta = createToken({
+      name: "B",
+      pattern: /b/,
+      pop_mode: true,
+    });
+    const tokens = {
+      modes: {
+        a: [Alpha],
+        b: [Beta],
+      },
+      defaultMode: "a",
+    };
+    const text = "abab";
+    const lexerSafeMode = new Lexer(tokens, {
+      positionTracking: "onlyOffset",
+      safeMode: true,
+    });
+    expect((<any>lexerSafeMode).charCodeToPatternIdxToConfig.a).to.be.empty;
+    const safeModeResult = lexerSafeMode.tokenize(text);
+
+    // compare to safeMode disabled
+    const lexerNoSafeMode = new Lexer(tokens, {
+      positionTracking: "onlyOffset",
+    });
+    expect(
+      (<any>lexerNoSafeMode).charCodeToPatternIdxToConfig.a[97][0].tokenType,
+    ).to.equal(Alpha);
+    const noSafeModeResult = lexerNoSafeMode.tokenize(text);
+    expect(safeModeResult).to.deep.equal(noSafeModeResult);
+  });
+
+  context("lexer optimization", () => {
+    const dFunction = (text: string, offset: number) => {
+      if (text.charAt(offset) === "d") {
+        return ["d"] as [string];
+      } else {
+        return null;
+      }
+    };
+
+    for (const [name, pattern] of [
+      ["function", dFunction],
+      ["unicode regexp", /d/u],
+      ["lookbehind regexp", /(?<!a)d/],
+    ] as const) {
+      it(`won't optimize ${name} patterns`, () => {
+        const Alpha = createToken({
+          name: "A",
+          pattern: "a",
+        });
+        const Beta = createToken({
+          name: "B",
+          pattern: "b",
+        });
+        const Delta = createToken({
+          name: "D",
+          pattern,
+        });
+        const optimizedLexer = new Lexer([Alpha, Delta, Beta], {
+          positionTracking: "onlyOffset",
+        });
+        // Assert that the pattern will be added to all character codes
+        // Also assert that the ordering gets preserved
+        expect(
+          (<any>optimizedLexer).charCodeToPatternIdxToConfig.defaultMode[
+            "a".charCodeAt(0)
+          ].map((e: any) => e.tokenType),
+        ).to.deep.equal([Alpha, Delta]);
+        expect(
+          (<any>optimizedLexer).charCodeToPatternIdxToConfig.defaultMode[
+            "b".charCodeAt(0)
+          ].map((e: any) => e.tokenType),
+        ).to.deep.equal([Delta, Beta]);
+        // The lexer cannot identify that the pattern is only for the character 'd'
+        expect(
+          (<any>optimizedLexer).charCodeToPatternIdxToConfig.defaultMode[
+            "d".charCodeAt(0)
+          ],
+        ).to.be.undefined;
+        expect(optimizedLexer.tokenize("a").tokens[0].tokenType).to.deep.equal(
+          Alpha,
+        );
+        expect(optimizedLexer.tokenize("b").tokens[0].tokenType).to.deep.equal(
+          Beta,
+        );
+        expect(optimizedLexer.tokenize("d").tokens[0].tokenType).to.deep.equal(
+          Delta,
+        );
+      });
+    }
   });
 });
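
For reference, here is a minimal usage sketch, separate from the patch itself, of the behavior this change targets: a mode that mixes optimizable and unoptimizable patterns now keeps its first-character dispatch table, with the unoptimizable token appended to every start-character bucket, instead of falling back to trying every pattern on every character. The token names and patterns below are illustrative only; the API calls (createToken, Lexer, tokenize) are chevrotain's public API.

```ts
import { createToken, Lexer } from "chevrotain";

// "A" has a plain string pattern, so its start character is statically
// known and it lands only in the bucket for "a".
const Alpha = createToken({ name: "A", pattern: "a" });

// A custom pattern function without start_chars_hint: its start characters
// cannot be analyzed. Before this patch, such a token disabled the
// first-char optimization for the whole mode; with it, only this token is
// tried for every start character.
const Delta = createToken({
  name: "D",
  pattern: (text: string, offset: number) =>
    text.charAt(offset) === "d" ? (["d"] as [string]) : null,
});

const lexer = new Lexer([Alpha, Delta], { positionTracking: "onlyOffset" });

// Tokenization results are unchanged; only the dispatch strategy differs.
console.log(lexer.tokenize("ad").tokens.map((t) => t.tokenType.name)); // ["A", "D"]
```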