WIP custom lexer adapters.
fixes #528
Shahar Soel authored and bd82 committed Jul 18, 2017
1 parent 16eec14 commit f0ddf90
Showing 4 changed files with 204 additions and 74 deletions.
3 changes: 3 additions & 0 deletions lexer_adapter.md
@@ -0,0 +1,3 @@
* parser.input method
- Maybe the parser should not accept a tokenVector array in its constructor at all?
- The input should be typed as any (or with a generic T argument?) rather than being specific to a tokenVector (see the sketch below).
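A rough illustration of where these notes might lead (all names below are hypothetical, not part of this commit):

    // hypothetical sketch: the parser consumes an abstract token source
    // instead of a concrete IToken[] vector
    interface ITokenSource<T> {
        LA(howMuch: number): T
        consume(): void
        exportState(): number
        importState(state: number): void
    }

    // the constructor would take only a vocabulary; input arrives later,
    // via a setter, in whatever shape the adapter understands
    class GenericParser<T> {
        public set input(newSource: ITokenSource<T>) {
            // reset parser state, then store the source
        }
    }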
170 changes: 96 additions & 74 deletions src/parse/parser_public.ts
@@ -126,7 +126,16 @@ export type IgnoredRuleIssues = { [dslNameAndOccurrence: string]: boolean }
export type IgnoredParserIssues = { [ruleName: string]: IgnoredRuleIssues }

const IN_RULE_RECOVERY_EXCEPTION = "InRuleRecoveryException"
const END_OF_FILE = createTokenInstance(EOF, "", NaN, NaN, NaN, NaN, NaN, NaN)
export const END_OF_FILE = createTokenInstance(
EOF,
"",
NaN,
NaN,
NaN,
NaN,
NaN,
NaN
)
Object.freeze(END_OF_FILE)

export type TokenMatcher = (
@@ -543,11 +552,10 @@ export class Parser {
protected maxLookahead: number
protected ignoredIssues: IgnoredParserIssues
protected outputCst: boolean

// adapters
protected errorMessageProvider: IErrorMessageProvider

protected _input: IToken[] = []
protected inputIdx = -1
protected savedTokenIdx = -1
protected isBackTrackingStack = []
protected className: string
protected RULE_STACK: string[] = []
@@ -572,6 +580,12 @@ export class Parser {
private LAST_EXPLICIT_RULE_STACK: number[] = []
private selfAnalysisDone = false

// lexer state
private tokVector: IToken[]
private tokVectorLength: number
private currIdx: number = -1
private savedLexerState: number

/**
* Only used internally for storing productions as they are built for the first time.
* The final productions should be accessed from the static cache.
@@ -586,7 +600,7 @@ export class Parser {
| IMultiModeLexerDefinition,
config: IParserConfig = DEFAULT_PARSER_CONFIG
) {
this._input = input
this.input = input

// configuration
this.recoveryEnabled = has(config, "recoveryEnabled")
@@ -716,15 +730,6 @@ export class Parser {
this._errors = newErrors
}

public set input(newInput: IToken[]) {
this.reset()
this._input = newInput
}

public get input(): IToken[] {
return cloneArr(this._input)
}

/**
* Resets the parser state, should be overridden for custom parsers which "carry" additional state.
* When overriding, remember to also invoke the super implementation!
@@ -734,7 +739,6 @@ export class Parser {

this.isBackTrackingStack = []
this.errors = []
this._input = []
this.RULE_STACK = []
this.LAST_EXPLICIT_RULE_STACK = []
this.CST_STACK = []
@@ -899,19 +903,6 @@ export class Parser {
}
}

// skips a token and returns the next token
protected SKIP_TOKEN(): IToken {
// example: assume 45 tokens in the input. If the input index is 44, NEXT_TOKEN would return
// input[45], the 46th item, which does not exist,
// so the largest valid input index is 43 (input.length - 2)
if (this.inputIdx <= this._input.length - 2) {
this.consumeToken()
return this.LA(1)
} else {
return END_OF_FILE
}
}

// Parsing DSL
/**
* Convenience method equivalent to CONSUME1.
@@ -1830,57 +1821,14 @@ export class Parser {
return consumedToken
}

/**
* Convenience method equivalent to LA(1)
* It is no longer used directly in chevrotain due to
* performance considerations (avoid the need for inlining optimizations).
*
* But it is maintained for backward compatibility reasons.
*
* @deprecated
*/
protected NEXT_TOKEN(): IToken {
return this.LA(1)
}

// Lexer (accessing Token vector) related methods which can be overridden to implement lazy lexers
// or lexers dependent on parser context.
protected LA(howMuch: number): IToken {
if (this._input.length <= this.inputIdx + howMuch) {
return END_OF_FILE
} else {
return this._input[this.inputIdx + howMuch]
}
}

protected consumeToken() {
this.inputIdx++
}

protected saveLexerState() {
this.savedTokenIdx = this.inputIdx
}

protected restoreLexerState() {
this.inputIdx = this.savedTokenIdx
}

protected resetLexerState(): void {
this.inputIdx = -1
}

protected moveLexerStateToEnd(): void {
this.inputIdx = this.input.length - 1
}

// other functionality
private saveRecogState(): IParserState {
// errors is a getter which will clone the errors array
let savedErrors = this.errors
let savedRuleStack = cloneArr(this.RULE_STACK)
return {
errors: savedErrors,
lexerState: this.inputIdx,
lexerState: this.exportLexerState(),
RULE_STACK: savedRuleStack,
CST_STACK: this.CST_STACK,
LAST_EXPLICIT_RULE_STACK: this.LAST_EXPLICIT_RULE_STACK
@@ -1889,7 +1837,7 @@ export class Parser {

private reloadRecogState(newState: IParserState) {
this.errors = newState.errors
this.inputIdx = newState.lexerState
this.importLexerState(newState.lexerState)
this.RULE_STACK = newState.RULE_STACK
}

@@ -1978,7 +1926,7 @@ export class Parser {
}
} else if (isFirstInvokedRule) {
// otherwise a Redundant input error will be created as well and we cannot guarantee that this is indeed the case
this.moveLexerStateToEnd()
this.moveToTerminatedState()
// the parser should never throw one of its own errors outside its flow.
// even if error recovery is disabled
return recoveryValueFunc()
@@ -3221,6 +3169,80 @@ export class Parser {
ruleCstResult
)
}

// lexer related methods
public set input(newInput: IToken[]) {
this.reset()
this.tokVector = newInput
this.tokVectorLength = newInput.length
}

public get input(): IToken[] {
return this.tokVector
}

// skips a token and returns the next token
protected SKIP_TOKEN(): IToken {
if (this.currIdx <= this.tokVector.length - 2) {
this.consumeToken()
return this.LA(1)
} else {
return END_OF_FILE
}
}

/**
* Convenience method equivalent to LA(1)
* It is no longer used directly in chevrotain due to
* performance considerations (avoid the need for inlining optimizations).
*
* But it is maintained for backward compatibility reasons.
*
* @deprecated
*/
protected NEXT_TOKEN(): IToken {
return this.LA(1)
}

// Lexer (accessing Token vector) related methods which can be overridden to implement lazy lexers
// or lexers dependent on parser context.
protected LA(howMuch: number): IToken {
// TODO: does this optimization (caching tokVectorLength) provide a measurable benefit?
if (this.tokVectorLength <= this.currIdx + howMuch) {
return END_OF_FILE
} else {
return this.tokVector[this.currIdx + howMuch]
}
}

protected consumeToken() {
this.currIdx++
}

protected exportLexerState(): number {
return this.currIdx
}

protected importLexerState(newState: number) {
this.currIdx = newState
}

// TODO: use export/import to describe save/restore?
protected saveLexerState() {
this.savedLexerState = this.currIdx
}

protected restoreLexerState() {
this.currIdx = this.savedLexerState
}

protected resetLexerState(): void {
this.currIdx = -1
}

protected moveToTerminatedState(): void {
this.currIdx = this.tokVector.length - 1
}
}

function InRuleRecoveryException(message: string) {
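Taken together, the parser_public.ts changes funnel all token-vector access through a small overridable surface: LA, consumeToken, exportLexerState / importLexerState, the save/restore/reset lexer-state hooks, and moveToTerminatedState. As a minimal sketch of a custom adapter built on these hooks, assuming a pull-based token source (the class name, iterator source, and vocabulary typing below are hypothetical, not part of this commit):

    import { IToken } from "../scan/tokens_public"
    import { END_OF_FILE, Parser } from "./parser_public"

    class LazyTokenParser extends Parser {
        private buffered: IToken[] = []
        private bufferedIdx: number = -1
        private savedBufferedIdx: number = -1

        constructor(private tokenSource: Iterator<IToken>, vocabulary: any) {
            super([], vocabulary)
        }

        // pull tokens from the source only when the parser actually looks ahead
        protected LA(howMuch: number): IToken {
            while (this.buffered.length <= this.bufferedIdx + howMuch) {
                const next = this.tokenSource.next()
                if (next.done) {
                    return END_OF_FILE
                }
                this.buffered.push(next.value)
            }
            return this.buffered[this.bufferedIdx + howMuch]
        }

        protected consumeToken() {
            this.bufferedIdx++
        }

        // a single number snapshots the position, which is all that
        // backtracking and error recovery need to roll back
        protected exportLexerState(): number {
            return this.bufferedIdx
        }

        protected importLexerState(newState: number) {
            this.bufferedIdx = newState
        }

        protected saveLexerState() {
            this.savedBufferedIdx = this.bufferedIdx
        }

        protected restoreLexerState() {
            this.bufferedIdx = this.savedBufferedIdx
        }

        protected resetLexerState(): void {
            this.bufferedIdx = -1
        }

        protected moveToTerminatedState(): void {
            this.bufferedIdx = this.buffered.length - 1
        }
    }

Note that saveLexerState / restoreLexerState are overridden as well: the base implementations read and write currIdx directly rather than going through exportLexerState / importLexerState, which is likely what the save/restore TODO in the diff above is pointing at.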
105 changes: 105 additions & 0 deletions test/full_flow/ecma_quirks/ecma_quirks.ts
@@ -0,0 +1,105 @@
// Using TypeScript we have both classes and static properties to define Tokens

import { createToken, createTokenInstance, IToken, Token } from "../../../src/scan/tokens_public"
import { Lexer } from "../../../src/scan/lexer_public"
import { END_OF_FILE, Parser } from "../../../src/parse/parser_public"

const Return = createToken({
name: "Return",
pattern: /return/y
})

const DivisionOperator = createToken({
name: "DivisionOperator",
pattern: /\//y
})

const RegExpLiteral = createToken({
name: "RegExpLiteral",
pattern: /\/\d+\//y
})

const NumberLiteral = createToken({
name: "NumberLiteral",
pattern: /\d+/y
})

// TODO: differentiate line terminators from other whitespace?
const WhiteSpace = createToken({
name: "WhiteSpace",
pattern: /\s+/y,
group: Lexer.SKIPPED,
line_breaks: true
})

const Semicolon = createToken({
name: "Semicolon",
pattern: /;/y
})

// the full vocabulary; note RegExpLiteral is listed before DivisionOperator
// so that on-demand lexing attempts the longer pattern first
const allTokens = [
    WhiteSpace,
    NumberLiteral,
    Return,
    RegExpLiteral,
    DivisionOperator,
    Semicolon
]

class EcmaScriptQuirksParser extends Parser {
constructor(input: Token[]) {
super(input, allTokens)
Parser.performSelfAnalysis(this)
}

public statement = this.RULE("statement", () => {
this.CONSUME(Return)
this.SUBRULE(this.expression)
this.CONSUME(Semicolon)
})

public expression = this.RULE("expression", () => {
this.SUBRULE(this.atomic)
this.MANY(() => {
this.CONSUME(DivisionOperator)
this.SUBRULE2(this.atomic)
})
})

public atomic = this.RULE("atomic", () => {
this.OR([
{ ALT: () => this.CONSUME(RegExpLiteral) },
{ ALT: () => this.CONSUME(NumberLiteral) }
])
})

private orgText: string
private textIdx: number

// lexer related methods
public set textInput(newInput: string) {
this.reset()
this.orgText = newInput
}

public get textInput(): string {
return this.orgText
}

protected resetLexerState(): void {
this.textIdx = 0
}

protected LA(howMuch: number): IToken {
    // peek howMuch tokens ahead without consuming any of them,
    // lexing lazily from the original text (see nextTokenAt below)
    let next = { token: END_OF_FILE, endIdx: this.textIdx }
    for (let i = 0; i < howMuch; i++) {
        next = this.nextTokenAt(next.endIdx)
    }
    return next.token
}
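
// NOTE: everything below is a hypothetical completion sketch, NOT part of
// the original WIP commit. nextTokenAt lexes a single token on demand:
// it tries each sticky (/y) pattern at startIdx, skips WhiteSpace, and
// returns the first match together with the index just past it. There is
// no caching and no lexer-error reporting, and the context-dependent
// regExp-vs-division quirk this test is named after remains unresolved.
private nextTokenAt(startIdx: number): { token: IToken; endIdx: number } {
    let idx = startIdx
    while (idx < this.orgText.length) {
        let skippedWhiteSpace = false
        for (const tokType of allTokens) {
            const pattern: RegExp = (<any>tokType).PATTERN
            pattern.lastIndex = idx
            const match = pattern.exec(this.orgText)
            if (match !== null) {
                idx = pattern.lastIndex
                if (tokType === WhiteSpace) {
                    skippedWhiteSpace = true
                    break // SKIPPED group --> scan again from the new index
                }
                return {
                    token: createTokenInstance(
                        tokType,
                        match[0],
                        NaN,
                        NaN,
                        NaN,
                        NaN,
                        NaN,
                        NaN
                    ),
                    endIdx: idx
                }
            }
        }
        if (!skippedWhiteSpace) {
            break // nothing matched --> treat as end of input for now
        }
    }
    return { token: END_OF_FILE, endIdx: idx }
}

// consuming advances the text index past the token LA(1) would return
protected consumeToken() {
    this.textIdx = this.nextTokenAt(this.textIdx).endIdx
}

// keep the snapshot hooks consistent with textIdx; the base class
// implementations operate on its own currIdx instead
private savedTextIdx: number

protected exportLexerState(): number {
    return this.textIdx
}

protected importLexerState(newState: number) {
    this.textIdx = newState
}

protected saveLexerState() {
    this.savedTextIdx = this.textIdx
}

protected restoreLexerState() {
    this.textIdx = this.savedTextIdx
}

protected moveToTerminatedState(): void {
    this.textIdx = this.orgText.length
}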
}

// reuse the same parser instance.
const parser = new EcmaScriptQuirksParser([])

export function parse(text: string): any {
    parser.textInput = text
    let value = parser.statement()

    return {
        value: value,
        parseErrors: parser.errors
    }
}
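
A hypothetical spec-style usage of the above (the expected outcome assumes the completion sketch in EcmaScriptQuirksParser):

    const result = parse("return /123/ ;")
    // the sticky patterns lex "/123/" as a RegExpLiteral here,
    // so parseErrors is expected to be empty
    console.log(result.parseErrors)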
Empty file.
