From e1746f8ed5ec7aff5024913ffdda94ca1db9a64c Mon Sep 17 00:00:00 2001
From: Mark Sujew
Date: Wed, 30 Oct 2024 14:16:01 +0100
Subject: [PATCH] feat: optimize unoptimizable token types

---
 packages/chevrotain/src/scan/lexer.ts        |  73 +++++++++++---
 packages/chevrotain/src/scan/lexer_public.ts |  30 ++++--
 packages/chevrotain/test/scan/lexer_spec.ts  | 101 +++++++++++++++++++
 3 files changed, 178 insertions(+), 26 deletions(-)

diff --git a/packages/chevrotain/src/scan/lexer.ts b/packages/chevrotain/src/scan/lexer.ts
index 35bac099d..5d0035e42 100644
--- a/packages/chevrotain/src/scan/lexer.ts
+++ b/packages/chevrotain/src/scan/lexer.ts
@@ -66,6 +66,7 @@ export interface IAnalyzeResult {
   emptyGroups: { [groupName: string]: IToken[] };
   hasCustom: boolean;
   canBeOptimized: boolean;
+  unoptimizedPatterns: IPatternConfig[];
 }
 
 export let SUPPORT_STICKY =
@@ -306,6 +307,7 @@ export function analyzeTokenTypes(
   });
 
   let canBeOptimized = true;
+  let unoptimizedPatterns: IPatternConfig[] = [];
  let charCodeToPatternIdxToConfig: { [charCode: number]: IPatternConfig[] } =
     [];
 
@@ -317,7 +319,12 @@ export function analyzeTokenTypes(
       if (typeof currTokType.PATTERN === "string") {
         const charCode = currTokType.PATTERN.charCodeAt(0);
         const optimizedIdx = charCodeToOptimizedIndex(charCode);
-        addToMapOfArrays(result, optimizedIdx, patternIdxToConfig[idx]);
+        addToMapOfArrays(
+          result,
+          optimizedIdx,
+          patternIdxToConfig[idx],
+          unoptimizedPatterns,
+        );
       } else if (isArray(currTokType.START_CHARS_HINT)) {
         let lastOptimizedIdx: number;
         forEach(currTokType.START_CHARS_HINT, (charOrInt) => {
@@ -336,21 +343,31 @@ export function analyzeTokenTypes(
               result,
               currOptimizedIdx,
               patternIdxToConfig[idx],
+              unoptimizedPatterns,
             );
           }
         });
       } else if (isRegExp(currTokType.PATTERN)) {
         if (currTokType.PATTERN.unicode) {
-          canBeOptimized = false;
+          forEach(Object.keys(result), (code) => {
+            addToMapOfArrays(
+              result,
+              Number(code),
+              patternIdxToConfig[idx],
+              unoptimizedPatterns,
+            );
+          });
+          unoptimizedPatterns.push(patternIdxToConfig[idx]);
           if (options.ensureOptimizations) {
             PRINT_ERROR(
               `${failedOptimizationPrefixMsg}` +
                 `\tUnable to analyze < ${currTokType.PATTERN.toString()} > pattern.\n` +
                 "\tThe regexp unicode flag is not currently supported by the regexp-to-ast library.\n" +
-                "\tThis will disable the lexer's first char optimizations.\n" +
+                "\tThis reduces lexer performance.\n" +
                 "\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#UNICODE_OPTIMIZE",
             );
           }
+          canBeOptimized = false;
         } else {
           const optimizedCodes = getOptimizedStartCodesIndices(
             currTokType.PATTERN,
@@ -358,27 +375,49 @@ export function analyzeTokenTypes(
           );
           /* istanbul ignore if */
           // start code will only be empty given an empty regExp or failure of regexp-to-ast library
-          // the first should be a different validation and the second cannot be tested.
           if (isEmpty(optimizedCodes)) {
             // we cannot understand what codes may start possible matches
-            // The optimization correctness requires knowing start codes for ALL patterns.
-            // Not actually sure this is an error, no debug message
+            // instead, simply add the token to all known start characters
+            forEach(Object.keys(result), (code) => {
+              addToMapOfArrays(
+                result,
+                Number(code),
+                patternIdxToConfig[idx],
+                unoptimizedPatterns,
+              );
+            });
+            unoptimizedPatterns.push(patternIdxToConfig[idx]);
             canBeOptimized = false;
+          } else {
+            forEach(optimizedCodes, (code) => {
+              addToMapOfArrays(
+                result,
+                code,
+                patternIdxToConfig[idx],
+                unoptimizedPatterns,
+              );
+            });
           }
-          forEach(optimizedCodes, (code) => {
-            addToMapOfArrays(result, code, patternIdxToConfig[idx]);
-          });
         }
       } else {
         if (options.ensureOptimizations) {
           PRINT_ERROR(
             `${failedOptimizationPrefixMsg}` +
               `\tTokenType: <${currTokType.name}> is using a custom token pattern without providing <start_chars_hint> parameter.\n` +
-              "\tThis will disable the lexer's first char optimizations.\n" +
+              "\tThis reduces lexer performance.\n" +
               "\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#CUSTOM_OPTIMIZE",
           );
         }
         canBeOptimized = false;
+        forEach(Object.keys(result), (code) => {
+          addToMapOfArrays(
+            result,
+            Number(code),
+            patternIdxToConfig[idx],
+            unoptimizedPatterns,
+          );
+        });
+        unoptimizedPatterns.push(patternIdxToConfig[idx]);
       }
 
       return result;
@@ -389,11 +428,12 @@ export function analyzeTokenTypes(
     );
   }
 
   return {
-    emptyGroups: emptyGroups,
-    patternIdxToConfig: patternIdxToConfig,
-    charCodeToPatternIdxToConfig: charCodeToPatternIdxToConfig,
-    hasCustom: hasCustom,
-    canBeOptimized: canBeOptimized,
+    emptyGroups,
+    patternIdxToConfig,
+    charCodeToPatternIdxToConfig,
+    hasCustom,
+    canBeOptimized,
+    unoptimizedPatterns,
   };
 }
 
@@ -1125,9 +1165,10 @@ function addToMapOfArrays<T>(
   map: Record<number, T[]>,
   key: number,
   value: T,
+  initial: T[],
 ): void {
   if (map[key] === undefined) {
-    map[key] = [value];
+    map[key] = [...initial, value];
   } else {
     map[key].push(value);
   }
 }
diff --git a/packages/chevrotain/src/scan/lexer_public.ts b/packages/chevrotain/src/scan/lexer_public.ts
index 40521ab45..633303b7b 100644
--- a/packages/chevrotain/src/scan/lexer_public.ts
+++ b/packages/chevrotain/src/scan/lexer_public.ts
@@ -97,6 +97,7 @@ export class Lexer {
   public lexerDefinitionWarning: ILexerDefinitionError[] = [];
 
   protected patternIdxToConfig: Record<string, IPatternConfig[]> = {};
+  protected unoptimizedPatterns: Record<string, IPatternConfig[]> = {};
   protected charCodeToPatternIdxToConfig: {
     [modeName: string]: { [charCode: number]: IPatternConfig[] };
   } = {};
@@ -261,6 +262,9 @@ export class Lexer {
           this.charCodeToPatternIdxToConfig[currModName] =
             currAnalyzeResult.charCodeToPatternIdxToConfig;
 
+          this.unoptimizedPatterns[currModName] =
+            currAnalyzeResult.unoptimizedPatterns;
+
           this.emptyGroups = assign(
             {},
             this.emptyGroups,
@@ -344,6 +348,11 @@ export class Lexer {
     });
 
     this.TRACE_INIT("Failed Optimization Warnings", () => {
+      if (config.ensureOptimizations !== true) {
+        // Return early
+        return;
+      }
+
       const unOptimizedModes = reduce(
         this.canModeBeOptimized,
         (cannotBeOptimized, canBeOptimized, modeName) => {
@@ -355,7 +364,7 @@ export class Lexer {
         [] as string[],
       );
 
-      if (config.ensureOptimizations && !isEmpty(unOptimizedModes)) {
+      if (!isEmpty(unOptimizedModes)) {
         throw Error(
           `Lexer Modes: < ${unOptimizedModes.join(
             ", ",
@@ -438,14 +447,13 @@ export class Lexer {
     let currModePatternsLength = 0;
     let patternIdxToConfig: IPatternConfig[] = [];
+    let unoptimizedPatterns: IPatternConfig[] = [];
     let currCharCodeToPatternIdxToConfig: {
       [charCode: number]: IPatternConfig[];
     } = [];
 
     const modeStack: string[] = [];
 
-    const emptyArray: IPatternConfig[] = [];
-    Object.freeze(emptyArray);
 
     let getPossiblePatterns!: (charCode: number) => IPatternConfig[];
 
     function getPossiblePatternsSlow() {
@@ -457,7 +465,7 @@ export class Lexer {
       const possiblePatterns =
         currCharCodeToPatternIdxToConfig[optimizedCharIdx];
       if (possiblePatterns === undefined) {
-        return emptyArray;
+        return unoptimizedPatterns;
       } else {
         return possiblePatterns;
       }
@@ -492,10 +500,12 @@ export class Lexer {
         currCharCodeToPatternIdxToConfig =
           this.charCodeToPatternIdxToConfig[newMode];
         currModePatternsLength = patternIdxToConfig.length;
-        const modeCanBeOptimized =
-          this.canModeBeOptimized[newMode] && this.config.safeMode === false;
+        unoptimizedPatterns = this.unoptimizedPatterns[newMode];
 
-        if (currCharCodeToPatternIdxToConfig && modeCanBeOptimized) {
+        if (
+          currCharCodeToPatternIdxToConfig &&
+          this.config.safeMode === false
+        ) {
           getPossiblePatterns = getPossiblePatternsOptimized;
         } else {
           getPossiblePatterns = getPossiblePatternsSlow;
@@ -508,14 +518,14 @@ export class Lexer {
         currCharCodeToPatternIdxToConfig =
           this.charCodeToPatternIdxToConfig[newMode];
 
+        unoptimizedPatterns = this.unoptimizedPatterns[newMode];
+
+        patternIdxToConfig = this.patternIdxToConfig[newMode];
         currModePatternsLength = patternIdxToConfig.length;
 
-        const modeCanBeOptimized =
-          this.canModeBeOptimized[newMode] && this.config.safeMode === false;
-
-        if (currCharCodeToPatternIdxToConfig && modeCanBeOptimized) {
+        if (currCharCodeToPatternIdxToConfig && this.config.safeMode === false) {
           getPossiblePatterns = getPossiblePatternsOptimized;
         } else {
           getPossiblePatterns = getPossiblePatternsSlow;
diff --git a/packages/chevrotain/test/scan/lexer_spec.ts b/packages/chevrotain/test/scan/lexer_spec.ts
index e5d4b0b73..d4d1e5cf8 100644
--- a/packages/chevrotain/test/scan/lexer_spec.ts
+++ b/packages/chevrotain/test/scan/lexer_spec.ts
@@ -2236,6 +2236,7 @@ describe("debugging and messages and optimizations", () => {
     });
     expect((<any>alphaLexerSafeMode).charCodeToPatternIdxToConfig.defaultMode)
       .to.be.empty;
+    const safeModeResult = alphaLexerSafeMode.tokenize("a");
 
     // compare to safeMode disabled
     const alphaLexerNoSafeMode = new Lexer([Alpha], {
@@ -2245,6 +2246,106 @@ describe("debugging and messages and optimizations", () => {
     expect(
       (<any>alphaLexerNoSafeMode).charCodeToPatternIdxToConfig
         .defaultMode[97][0].tokenType,
     ).to.equal(Alpha);
+    const noSafeModeResult = alphaLexerNoSafeMode.tokenize("a");
+    expect(safeModeResult).to.deep.equal(noSafeModeResult);
+  });
+
+  it("won't optimize with safe mode enabled - multi mode lexer", () => {
+    const Alpha = createToken({
+      name: "A",
+      pattern: /a/,
+      push_mode: "b",
+    });
+    const Beta = createToken({
+      name: "B",
+      pattern: /b/,
+      pop_mode: true,
+    });
+    const tokens = {
+      modes: {
+        a: [Alpha],
+        b: [Beta],
+      },
+      defaultMode: "a",
+    };
+    const text = "abab";
+    const lexerSafeMode = new Lexer(tokens, {
+      positionTracking: "onlyOffset",
+      safeMode: true,
+    });
+    expect((<any>lexerSafeMode).charCodeToPatternIdxToConfig.a).to.be.empty;
+    const safeModeResult = lexerSafeMode.tokenize(text);
+
+    // compare to safeMode disabled
+    const lexerNoSafeMode = new Lexer(tokens, {
+      positionTracking: "onlyOffset",
+    });
+    expect(
+      (<any>lexerNoSafeMode).charCodeToPatternIdxToConfig.a[97][0].tokenType,
+    ).to.equal(Alpha);
+    const noSafeModeResult = lexerNoSafeMode.tokenize(text);
+    expect(safeModeResult).to.deep.equal(noSafeModeResult);
+  });
+
+  context("lexer optimization", () => {
+    const dFunction = (text: string, offset: number) => {
+      if (text.charAt(offset) === "d") {
+        return ["d"] as [string];
+      } else {
+        return null;
+      }
+    };
+
+    for (const [name, pattern] of [
+      ["function", dFunction],
+      ["unicode regexp", /d/u],
+      ["lookbehind regexp", /(?<!a)d/],
+    ] as const) {
+      it(`won't optimize ${name} patterns`, () => {
+        const Alpha = createToken({
+          name: "A",
+          pattern: "a",
+        });
+        const Beta = createToken({
+          name: "B",
+          pattern: "b",
+        });
+        const Delta = createToken({
+          name: "D",
+          pattern,
+        });
+        const optimizedLexer = new Lexer([Alpha, Delta, Beta], {
+          positionTracking: "onlyOffset",
+        });
+        // Assert that the pattern will be added to all character codes
+        // Also assert that the ordering gets preserved
+        expect(
+          (<any>optimizedLexer).charCodeToPatternIdxToConfig.defaultMode[
+            "a".charCodeAt(0)
+          ].map((e: any) => e.tokenType),
+        ).to.deep.equal([Alpha, Delta]);
+        expect(
+          (<any>optimizedLexer).charCodeToPatternIdxToConfig.defaultMode[
+            "b".charCodeAt(0)
+          ].map((e: any) => e.tokenType),
+        ).to.deep.equal([Delta, Beta]);
+        // The lexer cannot identify that the pattern is only for the character 'd'
+        expect(
+          (<any>optimizedLexer).charCodeToPatternIdxToConfig.defaultMode[
+            "d".charCodeAt(0)
+          ],
+        ).to.be.undefined;
+        expect(optimizedLexer.tokenize("a").tokens[0].tokenType).to.deep.equal(
+          Alpha,
+        );
+        expect(optimizedLexer.tokenize("b").tokens[0].tokenType).to.deep.equal(
+          Beta,
+        );
+        expect(optimizedLexer.tokenize("d").tokens[0].tokenType).to.deep.equal(
+          Delta,
+        );
+      });
+    }
   });
 });
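
For reference, here is a minimal usage sketch, separate from the patch itself, of the behavior this change targets: a mode that mixes optimizable and unoptimizable patterns now keeps its first-character dispatch table, with the unoptimizable token appended to every start-character bucket, instead of falling back to trying every pattern on every character. The token names and patterns below are illustrative only; the API calls (createToken, Lexer, tokenize) are chevrotain's public API.

```ts
import { createToken, Lexer } from "chevrotain";

// "A" has a plain string pattern, so its start character is statically
// known and it lands only in the bucket for "a".
const Alpha = createToken({ name: "A", pattern: "a" });

// A custom pattern function without start_chars_hint: its start characters
// cannot be analyzed. Before this patch, such a token disabled the
// first-char optimization for the whole mode; with it, only this token is
// tried for every start character.
const Delta = createToken({
  name: "D",
  pattern: (text: string, offset: number) =>
    text.charAt(offset) === "d" ? (["d"] as [string]) : null,
});

const lexer = new Lexer([Alpha, Delta], { positionTracking: "onlyOffset" });

// Tokenization results are unchanged; only the dispatch strategy differs.
console.log(lexer.tokenize("ad").tokens.map((t) => t.tokenType.name)); // ["A", "D"]
```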