From f0ddf90aeef2e2e4480718bfd5c34165490f7528 Mon Sep 17 00:00:00 2001
From: Shahar Soel
Date: Mon, 3 Jul 2017 00:38:22 +0300
Subject: [PATCH] WIP custom lexer adapters. fixes #528

---
 lexer_adapter.md                              |   3 +
 src/parse/parser_public.ts                    | 170 ++++++++++--------
 test/full_flow/ecma_quirks/ecma_quirks.ts     | 105 +++++++++++
 .../full_flow/ecma_quirks/ecma_quirks_spec.ts |   0
 4 files changed, 204 insertions(+), 74 deletions(-)
 create mode 100644 lexer_adapter.md
 create mode 100644 test/full_flow/ecma_quirks/ecma_quirks.ts
 create mode 100644 test/full_flow/ecma_quirks/ecma_quirks_spec.ts

diff --git a/lexer_adapter.md b/lexer_adapter.md
new file mode 100644
index 0000000000..38f4bf3b48
--- /dev/null
+++ b/lexer_adapter.md
@@ -0,0 +1,3 @@
+* parser.input method
+  - Maybe the parser should not be able to accept a tokenVector array in its constructor?
+  - the input should have an any (or generic T argument?) type, not one specific to a tokenVector
\ No newline at end of file
diff --git a/src/parse/parser_public.ts b/src/parse/parser_public.ts
index d3ca62d48f..59b2bfcdb5 100644
--- a/src/parse/parser_public.ts
+++ b/src/parse/parser_public.ts
@@ -126,7 +126,16 @@ export type IgnoredRuleIssues = { [dslNameAndOccurrence: string]: boolean }
 export type IgnoredParserIssues = { [ruleName: string]: IgnoredRuleIssues }
 
 const IN_RULE_RECOVERY_EXCEPTION = "InRuleRecoveryException"
-const END_OF_FILE = createTokenInstance(EOF, "", NaN, NaN, NaN, NaN, NaN, NaN)
+export const END_OF_FILE = createTokenInstance(
+    EOF,
+    "",
+    NaN,
+    NaN,
+    NaN,
+    NaN,
+    NaN,
+    NaN
+)
 Object.freeze(END_OF_FILE)
 
 export type TokenMatcher = (
@@ -543,11 +552,10 @@ export class Parser {
     protected maxLookahead: number
     protected ignoredIssues: IgnoredParserIssues
     protected outputCst: boolean
+
+    // adapters
     protected errorMessageProvider: IErrorMessageProvider
 
-    protected _input: IToken[] = []
-    protected inputIdx = -1
-    protected savedTokenIdx = -1
     protected isBackTrackingStack = []
     protected className: string
     protected RULE_STACK: string[] = []
@@ -572,6 +580,12 @@ export class Parser {
     private LAST_EXPLICIT_RULE_STACK: number[] = []
     private selfAnalysisDone = false
 
+    // lexerState
+    private tokVector: IToken[]
+    private tokVectorLength: number
+    private currIdx: number = -1
+    private savedLexerState: number
+
     /**
      * Only used internally for storing productions as they are built for the first time.
      * The final productions should be accessed from the static cache.
@@ -586,7 +600,7 @@ export class Parser {
             | IMultiModeLexerDefinition,
         config: IParserConfig = DEFAULT_PARSER_CONFIG
     ) {
-        this._input = input
+        this.input = input
 
         // configuration
         this.recoveryEnabled = has(config, "recoveryEnabled")
@@ -716,15 +730,6 @@ export class Parser {
         this._errors = newErrors
     }
 
-    public set input(newInput: IToken[]) {
-        this.reset()
-        this._input = newInput
-    }
-
-    public get input(): IToken[] {
-        return cloneArr(this._input)
-    }
-
     /**
      * Resets the parser state, should be overridden for custom parsers which "carry" additional state.
      * When overriding, remember to also invoke the super implementation!
@@ -734,7 +739,6 @@ export class Parser {
 
         this.isBackTrackingStack = []
         this.errors = []
-        this._input = []
         this.RULE_STACK = []
         this.LAST_EXPLICIT_RULE_STACK = []
         this.CST_STACK = []
@@ -899,19 +903,6 @@ export class Parser {
         }
     }
 
-    // skips a token and returns the next token
-    protected SKIP_TOKEN(): IToken {
-        // example: assume 45 tokens in the input, if input index is 44 it means that NEXT_TOKEN will return
-        // input[45] which is the 46th item and no longer exists,
-        // so in this case the largest valid input index is 43 (input.length - 2 )
-        if (this.inputIdx <= this._input.length - 2) {
-            this.consumeToken()
-            return this.LA(1)
-        } else {
-            return END_OF_FILE
-        }
-    }
-
     // Parsing DSL
     /**
      * Convenience method equivalent to CONSUME1.
@@ -1830,49 +1821,6 @@ export class Parser {
         return consumedToken
     }
 
-    /**
-     * Convenience method equivalent to LA(1)
-     * It is no longer used directly in chevrotain due to
-     * performance considerations (avoid the need for inlining optimizations).
-     *
-     * But it is maintained for backward compatibility reasons.
-     *
-     * @deprecated
-     */
-    protected NEXT_TOKEN(): IToken {
-        return this.LA(1)
-    }
-
-    // Lexer (accessing Token vector) related methods which can be overridden to implement lazy lexers
-    // or lexers dependent on parser context.
-    protected LA(howMuch: number): IToken {
-        if (this._input.length <= this.inputIdx + howMuch) {
-            return END_OF_FILE
-        } else {
-            return this._input[this.inputIdx + howMuch]
-        }
-    }
-
-    protected consumeToken() {
-        this.inputIdx++
-    }
-
-    protected saveLexerState() {
-        this.savedTokenIdx = this.inputIdx
-    }
-
-    protected restoreLexerState() {
-        this.inputIdx = this.savedTokenIdx
-    }
-
-    protected resetLexerState(): void {
-        this.inputIdx = -1
-    }
-
-    protected moveLexerStateToEnd(): void {
-        this.inputIdx = this.input.length - 1
-    }
-
     // other functionality
     private saveRecogState(): IParserState {
         // errors is a getter which will clone the errors array
@@ -1880,7 +1828,7 @@ export class Parser {
         let savedRuleStack = cloneArr(this.RULE_STACK)
         return {
            errors: savedErrors,
-            lexerState: this.inputIdx,
+            lexerState: this.exportLexerState(),
             RULE_STACK: savedRuleStack,
             CST_STACK: this.CST_STACK,
             LAST_EXPLICIT_RULE_STACK: this.LAST_EXPLICIT_RULE_STACK
@@ -1889,7 +1837,7 @@ export class Parser {
 
     private reloadRecogState(newState: IParserState) {
         this.errors = newState.errors
-        this.inputIdx = newState.lexerState
+        this.importLexerState(newState.lexerState)
         this.RULE_STACK = newState.RULE_STACK
     }
 
@@ -1978,7 +1926,7 @@ export class Parser {
             }
         } else if (isFirstInvokedRule) {
             // otherwise a Redundant input error will be created as well and we cannot guarantee that this is indeed the case
-            this.moveLexerStateToEnd()
+            this.moveToTerminatedState()
             // the parser should never throw one of its own errors outside its flow.
             // even if error recovery is disabled
             return recoveryValueFunc()
@@ -3221,6 +3169,80 @@ export class Parser {
             ruleCstResult
         )
     }
+
+    // lexer related methods
+    public set input(newInput: IToken[]) {
+        this.reset()
+        this.tokVector = newInput
+        this.tokVectorLength = newInput.length
+    }
+
+    public get input(): IToken[] {
+        return this.tokVector
+    }
+
+    // skips a token and returns the next token
+    protected SKIP_TOKEN(): IToken {
+        if (this.currIdx <= this.tokVector.length - 2) {
+            this.consumeToken()
+            return this.LA(1)
+        } else {
+            return END_OF_FILE
+        }
+    }
+
+    /**
+     * Convenience method equivalent to LA(1)
+     * It is no longer used directly in chevrotain due to
+     * performance considerations (avoid the need for inlining optimizations).
+     *
+     * But it is maintained for backward compatibility reasons.
+     *
+     * @deprecated
+     */
+    protected NEXT_TOKEN(): IToken {
+        return this.LA(1)
+    }
+
+    // Lexer (accessing Token vector) related methods which can be overridden to implement lazy lexers
+    // or lexers dependent on parser context.
+    protected LA(howMuch: number): IToken {
+        // TODO: does caching tokVectorLength actually provide a measurable benefit?
+        if (this.tokVectorLength <= this.currIdx + howMuch) {
+            return END_OF_FILE
+        } else {
+            return this.tokVector[this.currIdx + howMuch]
+        }
+    }
+
+    protected consumeToken() {
+        this.currIdx++
+    }
+
+    protected exportLexerState(): number {
+        return this.currIdx
+    }
+
+    protected importLexerState(newState: number) {
+        this.currIdx = newState
+    }
+
+    // TODO: use export/import to describe save/restore?
+    protected saveLexerState() {
+        this.savedLexerState = this.currIdx
+    }
+
+    protected restoreLexerState() {
+        this.currIdx = this.savedLexerState
+    }
+
+    protected resetLexerState(): void {
+        this.currIdx = -1
+    }
+
+    protected moveToTerminatedState(): void {
+        this.currIdx = this.tokVector.length - 1
+    }
 }
 
 function InRuleRecoveryException(message: string) {
diff --git a/test/full_flow/ecma_quirks/ecma_quirks.ts b/test/full_flow/ecma_quirks/ecma_quirks.ts
new file mode 100644
index 0000000000..30794fb728
--- /dev/null
+++ b/test/full_flow/ecma_quirks/ecma_quirks.ts
@@ -0,0 +1,105 @@
+// Using TypeScript we have both classes and static properties to define Tokens
+
+import { createToken, IToken, Token } from "../../../src/scan/tokens_public"
+import { Lexer } from "../../../src/scan/lexer_public"
+import { END_OF_FILE, Parser } from "../../../src/parse/parser_public"
+
+const Return = createToken({
+    name: "Return",
+    pattern: /return/y
+})
+
+const DivisionOperator = createToken({
+    name: "DivisionOperator",
+    pattern: /\//y
+})
+
+const RegExpLiteral = createToken({
+    name: "RegExpLiteral",
+    pattern: /\/\d+\//y
+})
+
+const NumberLiteral = createToken({
+    name: "NumberLiteral",
+    pattern: /\d+/y
+})
+
+// todo differentiate line terminators and other whitespace?
+const WhiteSpace = createToken({
+    name: "WhiteSpace",
+    pattern: /\s+/y,
+    group: Lexer.SKIPPED,
+    line_breaks: true
+})
+
+const Semicolon = createToken({
+    name: "Semicolon",
+    pattern: /;/y
+})
+
+const allTokens = [WhiteSpace, Return, DivisionOperator, RegExpLiteral, NumberLiteral, Semicolon]
+
+class EcmaScriptQuirksParser extends Parser {
+    constructor(input: Token[]) {
+        super(input, allTokens)
+        Parser.performSelfAnalysis(this)
+    }
+
+    public statement = this.RULE("statement", () => {
+        this.CONSUME(Return)
+        this.SUBRULE(this.expression)
+        this.CONSUME(Semicolon)
+    })
+
+    public expression = this.RULE("expression", () => {
+        this.SUBRULE(this.atomic)
+        this.MANY(() => {
+            this.CONSUME(DivisionOperator)
+            this.SUBRULE2(this.atomic)
+        })
+    })
+
+    public atomic = this.RULE("atomic", () => {
+        this.OR([
+            { ALT: () => this.CONSUME(RegExpLiteral) },
+            { ALT: () => this.CONSUME(NumberLiteral) }
+        ])
+    })
+
+    private orgText: string
+    private textIdx: number
+
+    // lexer related methods
+    public set textInput(newInput: string) {
+        this.reset()
+        this.orgText = newInput
+    }
+
+    public get textInput(): string {
+        return this.orgText
+    }
+
+    protected resetLexerState(): void {
+        this.textIdx = 0
+    }
+
+    protected LA(howMuch: number): IToken {
+        if (this.orgText.length <= this.textIdx) {
+            return END_OF_FILE
+        } else {
+        }
+    }
+}
+
+// reuse the same parser instance.
+const parser = new EcmaScriptQuirksParser([])
+
+export function parse(text: string): any {
+    parser.textInput = text
+    let value = parser.statement()
+
+    return {
+        value: value,
+        parseErrors: parser.errors
+    }
+}
diff --git a/test/full_flow/ecma_quirks/ecma_quirks_spec.ts b/test/full_flow/ecma_quirks/ecma_quirks_spec.ts
new file mode 100644
index 0000000000..e69de29bb2
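
The LA override in ecma_quirks.ts above is still a stub: when the end of the text has not been reached, its else branch scans nothing and returns undefined, so the parse() helper cannot work yet. Below is a minimal, self-contained sketch of the on-demand scanning such an override could perform. It assumes every token uses a sticky (/y) pattern, as the createToken definitions in the test file do; the TokenPattern and ScannedToken shapes and the scanNextToken helper are illustrative names only, not part of this patch or of chevrotain's API.

// Hypothetical helper for a lazy lexer adapter: scan the next token directly
// from the source text instead of consuming a pre-built token vector.
interface TokenPattern {
    name: string
    pattern: RegExp // assumed sticky (/y), as in ecma_quirks.ts
    skipped?: boolean // e.g. WhiteSpace
}

interface ScannedToken {
    image: string
    startOffset: number
    tokenName: string
}

// Try each pattern at `offset`, silently skipping "skipped" matches (whitespace).
// Returns a null token when the end of the text is reached or when no pattern
// matches (a lexing error).
function scanNextToken(
    text: string,
    offset: number,
    patterns: TokenPattern[]
): { token: ScannedToken | null; nextOffset: number } {
    let currOffset = offset
    while (currOffset < text.length) {
        let matched = false
        for (const def of patterns) {
            // a sticky regexp only matches at lastIndex, i.e. exactly at currOffset
            def.pattern.lastIndex = currOffset
            const match = def.pattern.exec(text)
            if (match !== null) {
                matched = true
                if (def.skipped) {
                    // whitespace: advance past it and keep scanning
                    currOffset += match[0].length
                    break
                }
                return {
                    token: {
                        image: match[0],
                        startOffset: currOffset,
                        tokenName: def.name
                    },
                    nextOffset: currOffset + match[0].length
                }
            }
        }
        if (!matched) {
            // no pattern matched: an unexpected character
            return { token: null, nextOffset: currOffset }
        }
    }
    // ran off the end of the input
    return { token: null, nextOffset: currOffset }
}

Inside EcmaScriptQuirksParser.LA the missing else branch could then call something like scanNextToken(this.orgText, this.textIdx, patterns), return END_OF_FILE when the token comes back null, and let consumeToken() advance this.textIdx to nextOffset; a lookahead larger than one would require repeating the scan or caching its results. Because the scan now runs inside the parser, the set (or order) of patterns tried can depend on parser context, which is what would let the same "/" character start a RegExpLiteral after "return" but act as a DivisionOperator after a NumberLiteral.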