feat: optimize unoptimizable token types #2072

Open · wants to merge 1 commit into base: master
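In outline, this PR teaches the lexer's first-char optimization to tolerate token types it cannot analyze. `analyzeTokenTypes` now collects patterns whose starting characters cannot be determined (regexps with the unicode flag, regexps whose start codes cannot be computed, and custom patterns without `start_chars_hint`) into an `unoptimizedPatterns` list; each such pattern is appended to every existing char-code bucket, seeds every bucket created later (preserving declaration order), and serves at lex time as the fallback for char codes that have no bucket. Below is a minimal, self-contained sketch of that bucket-seeding scheme; the names (`Pattern`, `buildBuckets`, `addTo`) are hypothetical and not part of Chevrotain's API.

type Pattern = { name: string; startChars: string[] | null };

function buildBuckets(patterns: Pattern[]): {
  buckets: Map<number, Pattern[]>;
  fallback: Pattern[]; // tried when a char code has no bucket at all
} {
  const buckets = new Map<number, Pattern[]>();
  const fallback: Pattern[] = [];

  // Mirrors the updated addToMapOfArrays: a brand-new bucket is seeded with
  // every unanalyzable pattern seen so far, preserving declaration order.
  const addTo = (key: number, value: Pattern): void => {
    const existing = buckets.get(key);
    if (existing === undefined) {
      buckets.set(key, [...fallback, value]);
    } else {
      existing.push(value);
    }
  };

  for (const p of patterns) {
    if (p.startChars !== null) {
      for (const ch of p.startChars) {
        addTo(ch.charCodeAt(0), p);
      }
    } else {
      // First chars unknown: the pattern must stay a candidate everywhere.
      for (const bucket of buckets.values()) {
        bucket.push(p);
      }
      fallback.push(p);
    }
  }
  return { buckets, fallback };
}

// Mirrors the new lexer_spec.ts test: A("a"), D(unanalyzable), B("b")
const { buckets, fallback } = buildBuckets([
  { name: "A", startChars: ["a"] },
  { name: "D", startChars: null },
  { name: "B", startChars: ["b"] },
]);
console.log(buckets.get(97), buckets.get(98), fallback);
// [A, D]  [D, B]  [D]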
73 changes: 57 additions & 16 deletions packages/chevrotain/src/scan/lexer.ts
@@ -66,6 +66,7 @@ export interface IAnalyzeResult {
emptyGroups: { [groupName: string]: IToken[] };
hasCustom: boolean;
canBeOptimized: boolean;
unoptimizedPatterns: IPatternConfig[];
}

export let SUPPORT_STICKY =
@@ -306,6 +307,7 @@ export function analyzeTokenTypes(
});

let canBeOptimized = true;
let unoptimizedPatterns: IPatternConfig[] = [];
let charCodeToPatternIdxToConfig: { [charCode: number]: IPatternConfig[] } =
[];

@@ -317,7 +319,12 @@
if (typeof currTokType.PATTERN === "string") {
const charCode = currTokType.PATTERN.charCodeAt(0);
const optimizedIdx = charCodeToOptimizedIndex(charCode);
addToMapOfArrays(
result,
optimizedIdx,
patternIdxToConfig[idx],
unoptimizedPatterns,
);
} else if (isArray(currTokType.START_CHARS_HINT)) {
let lastOptimizedIdx: number;
forEach(currTokType.START_CHARS_HINT, (charOrInt) => {
@@ -336,49 +343,81 @@
result,
currOptimizedIdx,
patternIdxToConfig[idx],
unoptimizedPatterns,
);
}
});
} else if (isRegExp(currTokType.PATTERN)) {
if (currTokType.PATTERN.unicode) {
forEach(Object.keys(result), (code) => {
addToMapOfArrays(
result,
Number(code),
patternIdxToConfig[idx],
unoptimizedPatterns,
);
});
unoptimizedPatterns.push(patternIdxToConfig[idx]);
if (options.ensureOptimizations) {
PRINT_ERROR(
`${failedOptimizationPrefixMsg}` +
`\tUnable to analyze < ${currTokType.PATTERN.toString()} > pattern.\n` +
"\tThe regexp unicode flag is not currently supported by the regexp-to-ast library.\n" +
"\tThis will disable the lexer's first char optimizations.\n" +
"\tThis reduces lexer performance.\n" +
"\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#UNICODE_OPTIMIZE",
);
}
canBeOptimized = false;
} else {
const optimizedCodes = getOptimizedStartCodesIndices(
currTokType.PATTERN,
options.ensureOptimizations,
);
/* istanbul ignore if */
// start code will only be empty given an empty regExp or failure of regexp-to-ast library
// the first should be a different validation and the second cannot be tested.
if (isEmpty(optimizedCodes)) {
// we cannot understand what codes may start possible matches
// The optimization correctness requires knowing start codes for ALL patterns.
// Not actually sure this is an error, no debug message
// instead, simply add the token to all known start characters
forEach(Object.keys(result), (code) => {
addToMapOfArrays(
result,
Number(code),
patternIdxToConfig[idx],
unoptimizedPatterns,
);
});
unoptimizedPatterns.push(patternIdxToConfig[idx]);
canBeOptimized = false;
} else {
forEach(optimizedCodes, (code) => {
addToMapOfArrays(
result,
code,
patternIdxToConfig[idx],
unoptimizedPatterns,
);
});
}
}
} else {
if (options.ensureOptimizations) {
PRINT_ERROR(
`${failedOptimizationPrefixMsg}` +
`\tTokenType: <${currTokType.name}> is using a custom token pattern without providing <start_chars_hint> parameter.\n` +
"\tThis will disable the lexer's first char optimizations.\n" +
"\tThis reduces lexer performance.\n" +
"\tFor details See: https://chevrotain.io/docs/guide/resolving_lexer_errors.html#CUSTOM_OPTIMIZE",
);
}
canBeOptimized = false;
forEach(Object.keys(result), (code) => {
addToMapOfArrays(
result,
Number(code),
patternIdxToConfig[idx],
unoptimizedPatterns,
);
});
unoptimizedPatterns.push(patternIdxToConfig[idx]);
}

return result;
@@ -389,11 +428,12 @@ export function analyzeTokenTypes(
}

return {
emptyGroups,
patternIdxToConfig,
charCodeToPatternIdxToConfig,
hasCustom,
canBeOptimized,
unoptimizedPatterns,
};
}

@@ -1125,9 +1165,10 @@ function addToMapOfArrays<T>(
map: Record<number, T[]>,
key: number,
value: T,
initial: T[],
): void {
if (map[key] === undefined) {
map[key] = [...initial, value];
} else {
map[key].push(value);
}
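To make the new `initial` parameter concrete, here is a short usage sketch of the helper above, with plain strings standing in for `IPatternConfig` entries:

const buckets: Record<number, string[]> = {};

// New key: the bucket is seeded with `initial` before the value is appended.
addToMapOfArrays(buckets, 98, "B", ["D"]); // buckets[98] === ["D", "B"]

// Existing key: `initial` is ignored and the value is simply pushed.
addToMapOfArrays(buckets, 98, "X", []); // buckets[98] === ["D", "B", "X"]

Note that `initial` only takes effect when a key is first created, which is why the analysis code above must also explicitly push an unoptimizable pattern into every bucket that already exists.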
30 changes: 20 additions & 10 deletions packages/chevrotain/src/scan/lexer_public.ts
@@ -97,6 +97,7 @@ export class Lexer {
public lexerDefinitionWarning: ILexerDefinitionError[] = [];

protected patternIdxToConfig: Record<string, IPatternConfig[]> = {};
protected unoptimizedPatterns: Record<string, IPatternConfig[]> = {};
protected charCodeToPatternIdxToConfig: {
[modeName: string]: { [charCode: number]: IPatternConfig[] };
} = {};
@@ -261,6 +262,9 @@
this.charCodeToPatternIdxToConfig[currModName] =
currAnalyzeResult.charCodeToPatternIdxToConfig;

this.unoptimizedPatterns[currModName] =
currAnalyzeResult.unoptimizedPatterns;

this.emptyGroups = assign(
{},
this.emptyGroups,
@@ -344,6 +348,11 @@
});

this.TRACE_INIT("Failed Optimization Warnings", () => {
if (config.ensureOptimizations !== true) {
// Return early
return;
}

const unOptimizedModes = reduce(
this.canModeBeOptimized,
(cannotBeOptimized, canBeOptimized, modeName) => {
@@ -355,7 +364,7 @@
[] as string[],
);

if (!isEmpty(unOptimizedModes)) {
throw Error(
`Lexer Modes: < ${unOptimizedModes.join(
", ",
@@ -438,14 +447,13 @@

let currModePatternsLength = 0;
let patternIdxToConfig: IPatternConfig[] = [];
let unoptimizedPatterns: IPatternConfig[] = [];
let currCharCodeToPatternIdxToConfig: {
[charCode: number]: IPatternConfig[];
} = [];

const modeStack: string[] = [];

let getPossiblePatterns!: (charCode: number) => IPatternConfig[];

function getPossiblePatternsSlow() {
@@ -457,7 +465,7 @@
const possiblePatterns =
currCharCodeToPatternIdxToConfig[optimizedCharIdx];
if (possiblePatterns === undefined) {
return unoptimizedPatterns;
} else {
return possiblePatterns;
}
@@ -492,10 +500,12 @@
currCharCodeToPatternIdxToConfig =
this.charCodeToPatternIdxToConfig[newMode];
currModePatternsLength = patternIdxToConfig.length;
unoptimizedPatterns = this.unoptimizedPatterns[newMode];

if (
currCharCodeToPatternIdxToConfig &&
this.config.safeMode === false
) {
getPossiblePatterns = getPossiblePatternsOptimized;
} else {
getPossiblePatterns = getPossiblePatternsSlow;
@@ -508,14 +518,14 @@
currCharCodeToPatternIdxToConfig =
this.charCodeToPatternIdxToConfig[newMode];

unoptimizedPatterns = this.unoptimizedPatterns[newMode];

patternIdxToConfig = this.patternIdxToConfig[newMode];
currModePatternsLength = patternIdxToConfig.length;

currModePatternsLength = patternIdxToConfig.length;
if (currCharCodeToPatternIdxToConfig && this.config.safeMode === false) {
getPossiblePatterns = getPossiblePatternsOptimized;
} else {
getPossiblePatterns = getPossiblePatternsSlow;
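The net effect on lookup: `getPossiblePatternsOptimized` is now installed whenever a char-code table exists and `safeMode` is off, even for modes that previously had to fall back to the slow path, and a missing bucket yields the mode's `unoptimizedPatterns` rather than a frozen empty array. A condensed sketch of the resulting behavior (a simplification, not the exact source):

// `table` is charCodeToPatternIdxToConfig[mode], keyed by optimized index.
function possiblePatternsFor(
  table: Record<number, IPatternConfig[]>,
  unoptimizedPatterns: IPatternConfig[],
  optimizedCharIdx: number,
): IPatternConfig[] {
  // An unseen first char can only be matched by the unanalyzable patterns,
  // since every analyzable pattern's start chars already have buckets.
  return table[optimizedCharIdx] ?? unoptimizedPatterns;
}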
101 changes: 101 additions & 0 deletions packages/chevrotain/test/scan/lexer_spec.ts
@@ -2236,6 +2236,7 @@ describe("debugging and messages and optimizations", () => {
});
expect((<any>alphaLexerSafeMode).charCodeToPatternIdxToConfig.defaultMode)
.to.be.empty;
const safeModeResult = alphaLexerSafeMode.tokenize("a");

// compare to safeMode disabled
const alphaLexerNoSafeMode = new Lexer([Alpha], {
@@ -2245,6 +2246,106 @@
(<any>alphaLexerNoSafeMode).charCodeToPatternIdxToConfig
.defaultMode[97][0].tokenType,
).to.equal(Alpha);
const noSafeModeResult = alphaLexerNoSafeMode.tokenize("a");
expect(safeModeResult).to.deep.equal(noSafeModeResult);
});

it("won't optimize with safe mode enabled - multi mode lexer", () => {
const Alpha = createToken({
name: "A",
pattern: /a/,
push_mode: "b",
});
const Beta = createToken({
name: "B",
pattern: /b/,
pop_mode: true,
});
const tokens = {
modes: {
a: [Alpha],
b: [Beta],
},
defaultMode: "a",
};
const text = "abab";
const lexerSafeMode = new Lexer(tokens, {
positionTracking: "onlyOffset",
safeMode: true,
});
expect((<any>lexerSafeMode).charCodeToPatternIdxToConfig.a).to.be.empty;
const safeModeResult = lexerSafeMode.tokenize(text);

// compare to safeMode disabled
const lexerNoSafeMode = new Lexer(tokens, {
positionTracking: "onlyOffset",
});
expect(
(<any>lexerNoSafeMode).charCodeToPatternIdxToConfig.a[97][0].tokenType,
).to.equal(Alpha);
const noSafeModeResult = lexerNoSafeMode.tokenize(text);
expect(safeModeResult).to.deep.equal(noSafeModeResult);
});

context("lexer optimization", () => {
const dFunction = (text: string, offset: number) => {
if (text.charAt(offset) === "d") {
return ["d"] as [string];
} else {
return null;
}
};

for (const [name, pattern] of [
["function", dFunction],
["unicode regexp", /d/u],
["lookbehind regexp", /(?<!a)d/],
]) {
it(`will optimize ${name} pattern`, () => {
const Alpha = createToken({
name: "A",
pattern: "a",
});
const Beta = createToken({
name: "B",
pattern: "b",
});
const Delta = createToken({
name: "D",
pattern,
});
const optimizedLexer = new Lexer([Alpha, Delta, Beta], {
positionTracking: "onlyOffset",
});
// Assert that the pattern will be added to all character codes
// Also assert that the ordering gets preserved
expect(
(<any>optimizedLexer).charCodeToPatternIdxToConfig.defaultMode[
"a".charCodeAt(0)
].map((e: any) => e.tokenType),
).to.deep.equal([Alpha, Delta]);
expect(
(<any>optimizedLexer).charCodeToPatternIdxToConfig.defaultMode[
"b".charCodeAt(0)
].map((e: any) => e.tokenType),
).to.deep.equal([Delta, Beta]);
// The lexer cannot identify that the pattern is only for the character 'd'
expect(
(<any>optimizedLexer).charCodeToPatternIdxToConfig.defaultMode[
"d".charCodeAt(0)
],
).to.be.undefined;
expect(optimizedLexer.tokenize("a").tokens[0].tokenType).to.deep.equal(
Alpha,
);
expect(optimizedLexer.tokenize("b").tokens[0].tokenType).to.deep.equal(
Beta,
);
expect(optimizedLexer.tokenize("d").tokens[0].tokenType).to.deep.equal(
Delta,
);
});
}
});
});
