From f0ddf90aeef2e2e4480718bfd5c34165490f7528 Mon Sep 17 00:00:00 2001
From: Shahar Soel
Date: Mon, 3 Jul 2017 00:38:22 +0300
Subject: [PATCH] WIP custom lexer adapters. fixes #528

---
 lexer_adapter.md                              |   3 +
 src/parse/parser_public.ts                    | 170 ++++++++++--------
 test/full_flow/ecma_quirks/ecma_quirks.ts     | 105 +++++++++++
 .../full_flow/ecma_quirks/ecma_quirks_spec.ts |   0
 4 files changed, 204 insertions(+), 74 deletions(-)
 create mode 100644 lexer_adapter.md
 create mode 100644 test/full_flow/ecma_quirks/ecma_quirks.ts
 create mode 100644 test/full_flow/ecma_quirks/ecma_quirks_spec.ts

diff --git a/lexer_adapter.md b/lexer_adapter.md
new file mode 100644
index 0000000000..38f4bf3b48
--- /dev/null
+++ b/lexer_adapter.md
@@ -0,0 +1,3 @@
+* parser.input method
+  - Maybe the parser should not be able to accept a tokenVector array in its constructor?
+  - the input should have an any (or generic T argument?) type, not one specific to a tokenVector
\ No newline at end of file
diff --git a/src/parse/parser_public.ts b/src/parse/parser_public.ts
index d3ca62d48f..59b2bfcdb5 100644
--- a/src/parse/parser_public.ts
+++ b/src/parse/parser_public.ts
@@ -126,7 +126,16 @@ export type IgnoredRuleIssues = { [dslNameAndOccurrence: string]: boolean }
 export type IgnoredParserIssues = { [ruleName: string]: IgnoredRuleIssues }
 
 const IN_RULE_RECOVERY_EXCEPTION = "InRuleRecoveryException"
-const END_OF_FILE = createTokenInstance(EOF, "", NaN, NaN, NaN, NaN, NaN, NaN)
+export const END_OF_FILE = createTokenInstance(
+    EOF,
+    "",
+    NaN,
+    NaN,
+    NaN,
+    NaN,
+    NaN,
+    NaN
+)
 Object.freeze(END_OF_FILE)
 
 export type TokenMatcher = (
@@ -543,11 +552,10 @@ export class Parser {
     protected maxLookahead: number
     protected ignoredIssues: IgnoredParserIssues
     protected outputCst: boolean
+
+    // adapters
     protected errorMessageProvider: IErrorMessageProvider
 
-    protected _input: IToken[] = []
-    protected inputIdx = -1
-    protected savedTokenIdx = -1
     protected isBackTrackingStack = []
     protected className: string
     protected RULE_STACK: string[] = []
@@ -572,6 +580,12 @@ export class Parser {
     private LAST_EXPLICIT_RULE_STACK: number[] = []
     private selfAnalysisDone = false
 
+    // lexerState
+    private tokVector: IToken[]
+    private tokVectorLength: number
+    private currIdx: number = -1
+    private savedLexerState: number
+
     /**
      * Only used internally for storing productions as they are built for the first time.
      * The final productions should be accessed from the static cache.
@@ -586,7 +600,7 @@ export class Parser {
             | IMultiModeLexerDefinition,
         config: IParserConfig = DEFAULT_PARSER_CONFIG
     ) {
-        this._input = input
+        this.input = input
 
         // configuration
         this.recoveryEnabled = has(config, "recoveryEnabled")
@@ -716,15 +730,6 @@ export class Parser {
         this._errors = newErrors
     }
 
-    public set input(newInput: IToken[]) {
-        this.reset()
-        this._input = newInput
-    }
-
-    public get input(): IToken[] {
-        return cloneArr(this._input)
-    }
-
     /**
      * Resets the parser state, should be overridden for custom parsers which "carry" additional state.
      * When overriding, remember to also invoke the super implementation!
@@ -734,7 +739,6 @@ export class Parser {
 
         this.isBackTrackingStack = []
         this.errors = []
-        this._input = []
         this.RULE_STACK = []
         this.LAST_EXPLICIT_RULE_STACK = []
         this.CST_STACK = []
@@ -899,19 +903,6 @@ export class Parser {
         }
     }
 
-    // skips a token and returns the next token
-    protected SKIP_TOKEN(): IToken {
-        // example: assume 45 tokens in the input, if input index is 44 it means that NEXT_TOKEN will return
-        // input[45] which is the 46th item and no longer exists,
-        // so in this case the largest valid input index is 43 (input.length - 2 )
-        if (this.inputIdx <= this._input.length - 2) {
-            this.consumeToken()
-            return this.LA(1)
-        } else {
-            return END_OF_FILE
-        }
-    }
-
     // Parsing DSL
     /**
      * Convenience method equivalent to CONSUME1.
@@ -1830,49 +1821,6 @@ export class Parser {
         return consumedToken
     }
 
-    /**
-     * Convenience method equivalent to LA(1)
-     * It is no longer used directly in chevrotain due to
-     * performance considerations (avoid the need for inlining optimizations).
-     *
-     * But it is maintained for backward compatibility reasons.
-     *
-     * @deprecated
-     */
-    protected NEXT_TOKEN(): IToken {
-        return this.LA(1)
-    }
-
-    // Lexer (accessing Token vector) related methods which can be overridden to implement lazy lexers
-    // or lexers dependent on parser context.
-    protected LA(howMuch: number): IToken {
-        if (this._input.length <= this.inputIdx + howMuch) {
-            return END_OF_FILE
-        } else {
-            return this._input[this.inputIdx + howMuch]
-        }
-    }
-
-    protected consumeToken() {
-        this.inputIdx++
-    }
-
-    protected saveLexerState() {
-        this.savedTokenIdx = this.inputIdx
-    }
-
-    protected restoreLexerState() {
-        this.inputIdx = this.savedTokenIdx
-    }
-
-    protected resetLexerState(): void {
-        this.inputIdx = -1
-    }
-
-    protected moveLexerStateToEnd(): void {
-        this.inputIdx = this.input.length - 1
-    }
-
     // other functionality
     private saveRecogState(): IParserState {
         // errors is a getter which will clone the errors array
@@ -1880,7 +1828,7 @@ export class Parser {
         let savedRuleStack = cloneArr(this.RULE_STACK)
         return {
            errors: savedErrors,
-            lexerState: this.inputIdx,
+            lexerState: this.exportLexerState(),
             RULE_STACK: savedRuleStack,
             CST_STACK: this.CST_STACK,
             LAST_EXPLICIT_RULE_STACK: this.LAST_EXPLICIT_RULE_STACK
@@ -1889,7 +1837,7 @@ export class Parser {
 
     private reloadRecogState(newState: IParserState) {
         this.errors = newState.errors
-        this.inputIdx = newState.lexerState
+        this.importLexerState(newState.lexerState)
         this.RULE_STACK = newState.RULE_STACK
     }
 
@@ -1978,7 +1926,7 @@ export class Parser {
             }
         } else if (isFirstInvokedRule) {
             // otherwise a Redundant input error will be created as well and we cannot guarantee that this is indeed the case
-            this.moveLexerStateToEnd()
+            this.moveToTerminatedState()
             // the parser should never throw one of its own errors outside its flow.
             // even if error recovery is disabled
             return recoveryValueFunc()
@@ -3221,6 +3169,80 @@ export class Parser {
             ruleCstResult
         )
     }
+
+    // lexer related methods
+    public set input(newInput: IToken[]) {
+        this.reset()
+        this.tokVector = newInput
+        this.tokVectorLength = newInput.length
+    }
+
+    public get input(): IToken[] {
+        return this.tokVector
+    }
+
+    // skips a token and returns the next token
+    protected SKIP_TOKEN(): IToken {
+        if (this.currIdx <= this.tokVector.length - 2) {
+            this.consumeToken()
+            return this.LA(1)
+        } else {
+            return END_OF_FILE
+        }
+    }
+
+    /**
+     * Convenience method equivalent to LA(1)
+     * It is no longer used directly in chevrotain due to
+     * performance considerations (avoid the need for inlining optimizations).
+     *
+     * But it is maintained for backward compatibility reasons.
+     *
+     * @deprecated
+     */
+    protected NEXT_TOKEN(): IToken {
+        return this.LA(1)
+    }
+
+    // Lexer (accessing Token vector) related methods which can be overridden to implement lazy lexers
+    // or lexers dependent on parser context.
+    protected LA(howMuch: number): IToken {
+        // TODO: does caching tokVectorLength actually provide a measurable benefit?
+        if (this.tokVectorLength <= this.currIdx + howMuch) {
+            return END_OF_FILE
+        } else {
+            return this.tokVector[this.currIdx + howMuch]
+        }
+    }
+
+    protected consumeToken() {
+        this.currIdx++
+    }
+
+    protected exportLexerState(): number {
+        return this.currIdx
+    }
+
+    protected importLexerState(newState: number) {
+        this.currIdx = newState
+    }
+
+    // TODO: use export/import to describe save/restore?
+    protected saveLexerState() {
+        this.savedLexerState = this.currIdx
+    }
+
+    protected restoreLexerState() {
+        this.currIdx = this.savedLexerState
+    }
+
+    protected resetLexerState(): void {
+        this.currIdx = -1
+    }
+
+    protected moveToTerminatedState(): void {
+        this.currIdx = this.tokVector.length - 1
+    }
 }
 
 function InRuleRecoveryException(message: string) {
diff --git a/test/full_flow/ecma_quirks/ecma_quirks.ts b/test/full_flow/ecma_quirks/ecma_quirks.ts
new file mode 100644
index 0000000000..30794fb728
--- /dev/null
+++ b/test/full_flow/ecma_quirks/ecma_quirks.ts
@@ -0,0 +1,105 @@
+// Using TypeScript we have both classes and static properties to define Tokens
+
+import { createToken, IToken, Token } from "../../../src/scan/tokens_public"
+import { Lexer } from "../../../src/scan/lexer_public"
+import { END_OF_FILE, Parser } from "../../../src/parse/parser_public"
+
+const Return = createToken({
+    name: "Return",
+    pattern: /return/y
+})
+
+const DivisionOperator = createToken({
+    name: "DivisionOperator",
+    pattern: /\//y
+})
+
+const RegExpLiteral = createToken({
+    name: "RegExpLiteral",
+    pattern: /\/\d+\//y
+})
+
+const NumberLiteral = createToken({
+    name: "NumberLiteral",
+    pattern: /\d+/y
+})
+
+// todo differentiate line terminators and other whitespace?
+const WhiteSpace = createToken({
+    name: "WhiteSpace",
+    pattern: /\s+/y,
+    group: Lexer.SKIPPED,
+    line_breaks: true
+})
+
+const Semicolon = createToken({
+    name: "Semicolon",
+    pattern: /;/y
+})
+
+const allTokens = [WhiteSpace, Return, DivisionOperator, RegExpLiteral, NumberLiteral, Semicolon]
+
+class EcmaScriptQuirksParser extends Parser {
+    constructor(input: Token[]) {
+        super(input, allTokens)
+        Parser.performSelfAnalysis(this)
+    }
+
+    public statement = this.RULE("statement", () => {
+        this.CONSUME(Return)
+        this.SUBRULE(this.expression)
+        this.CONSUME(Semicolon)
+    })
+
+    public expression = this.RULE("expression", () => {
+        this.SUBRULE(this.atomic)
+        this.MANY(() => {
+            this.CONSUME(DivisionOperator)
+            this.SUBRULE2(this.atomic)
+        })
+    })
+
+    public atomic = this.RULE("atomic", () => {
+        this.OR([
+            { ALT: () => this.CONSUME(RegExpLiteral) },
+            { ALT: () => this.CONSUME(NumberLiteral) }
+        ])
+    })
+
+    private orgText: string
+    private textIdx: number
+
+    // lexer related methods
+    public set textInput(newInput: string) {
+        this.reset()
+        this.orgText = newInput
+    }
+
+    public get textInput(): string {
+        return this.orgText
+    }
+
+    protected resetLexerState(): void {
+        this.textIdx = 0
+    }
+
+    protected LA(howMuch: number): IToken {
+        if (this.orgText.length <= this.textIdx) {
+            return END_OF_FILE
+        } else {
+        }
+    }
+}
+
+// reuse the same parser instance.
+const parser = new EcmaScriptQuirksParser([])
+
+export function parse(text: string): any {
+    parser.textInput = text
+    let value = parser.statement()
+
+    return {
+        value: value,
+        parseErrors: parser.errors
+    }
+}
diff --git a/test/full_flow/ecma_quirks/ecma_quirks_spec.ts b/test/full_flow/ecma_quirks/ecma_quirks_spec.ts
new file mode 100644
index 0000000000..e69de29bb2
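
The LA override in ecma_quirks.ts above is still a stub: when the end of the text has not been reached, its else branch scans nothing and returns undefined, so the parse() helper cannot work yet. Below is a minimal, self-contained sketch of the on-demand scanning such an override could perform. It assumes every token uses a sticky (/y) pattern, as the createToken definitions in the test file do; the TokenPattern and ScannedToken shapes and the scanNextToken helper are illustrative names only, not part of this patch or of chevrotain's API.

// Hypothetical helper for a lazy lexer adapter: scan the next token directly
// from the source text instead of consuming a pre-built token vector.
interface TokenPattern {
    name: string
    pattern: RegExp // assumed sticky (/y), as in ecma_quirks.ts
    skipped?: boolean // e.g. WhiteSpace
}

interface ScannedToken {
    image: string
    startOffset: number
    tokenName: string
}

// Try each pattern at `offset`, silently skipping "skipped" matches (whitespace).
// Returns a null token when the end of the text is reached or when no pattern
// matches (a lexing error).
function scanNextToken(
    text: string,
    offset: number,
    patterns: TokenPattern[]
): { token: ScannedToken | null; nextOffset: number } {
    let currOffset = offset
    while (currOffset < text.length) {
        let matched = false
        for (const def of patterns) {
            // a sticky regexp only matches at lastIndex, i.e. exactly at currOffset
            def.pattern.lastIndex = currOffset
            const match = def.pattern.exec(text)
            if (match !== null) {
                matched = true
                if (def.skipped) {
                    // whitespace: advance past it and keep scanning
                    currOffset += match[0].length
                    break
                }
                return {
                    token: {
                        image: match[0],
                        startOffset: currOffset,
                        tokenName: def.name
                    },
                    nextOffset: currOffset + match[0].length
                }
            }
        }
        if (!matched) {
            // no pattern matched: an unexpected character
            return { token: null, nextOffset: currOffset }
        }
    }
    // ran off the end of the input
    return { token: null, nextOffset: currOffset }
}

Inside EcmaScriptQuirksParser.LA the missing else branch could then call something like scanNextToken(this.orgText, this.textIdx, patterns), return END_OF_FILE when the token comes back null, and let consumeToken() advance this.textIdx to nextOffset; a lookahead larger than one would require repeating the scan or caching its results. Because the scan now runs inside the parser, the set (or order) of patterns tried can depend on parser context, which is what would let the same "/" character start a RegExpLiteral after "return" but act as a DivisionOperator after a NumberLiteral.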