WIP custom lexer adapters.
fixes #528
Shahar Soel authored and bd82 committed Jul 18, 2017
1 parent 16eec14 commit f0ddf90
Showing 4 changed files with 204 additions and 74 deletions.
3 changes: 3 additions & 0 deletions lexer_adapter.md
@@ -0,0 +1,3 @@
* parser.input method
- Maybe the parser should not accept a tokenVector array in its constructor at all?
- The input should be typed as any (or with a generic T argument?) rather than being specific to a tokenVector (see the sketch below).
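A rough illustration of where these notes might lead (all names below are hypothetical, not part of this commit):

    // hypothetical sketch: the parser consumes an abstract token source
    // instead of a concrete IToken[] vector
    interface ITokenSource<T> {
        LA(howMuch: number): T
        consume(): void
        exportState(): number
        importState(state: number): void
    }

    // the constructor would take only a vocabulary; input arrives later,
    // via a setter, in whatever shape the adapter understands
    class GenericParser<T> {
        public set input(newSource: ITokenSource<T>) {
            // reset parser state, then store the source
        }
    }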
170 changes: 96 additions & 74 deletions src/parse/parser_public.ts
@@ -126,7 +126,16 @@ export type IgnoredRuleIssues = { [dslNameAndOccurrence: string]: boolean }
export type IgnoredParserIssues = { [ruleName: string]: IgnoredRuleIssues }

const IN_RULE_RECOVERY_EXCEPTION = "InRuleRecoveryException"
const END_OF_FILE = createTokenInstance(EOF, "", NaN, NaN, NaN, NaN, NaN, NaN)
export const END_OF_FILE = createTokenInstance(
EOF,
"",
NaN,
NaN,
NaN,
NaN,
NaN,
NaN
)
Object.freeze(END_OF_FILE)

export type TokenMatcher = (
@@ -543,11 +552,10 @@ export class Parser {
protected maxLookahead: number
protected ignoredIssues: IgnoredParserIssues
protected outputCst: boolean

// adapters
protected errorMessageProvider: IErrorMessageProvider

protected _input: IToken[] = []
protected inputIdx = -1
protected savedTokenIdx = -1
protected isBackTrackingStack = []
protected className: string
protected RULE_STACK: string[] = []
@@ -572,6 +580,12 @@ export class Parser {
private LAST_EXPLICIT_RULE_STACK: number[] = []
private selfAnalysisDone = false

// lexer state
private tokVector: IToken[]
private tokVectorLength: number
private currIdx: number = -1
private savedLexerState: number

/**
* Only used internally for storing productions as they are built for the first time.
* The final productions should be accessed from the static cache.
@@ -586,7 +600,7 @@ export class Parser {
| IMultiModeLexerDefinition,
config: IParserConfig = DEFAULT_PARSER_CONFIG
) {
this._input = input
this.input = input

// configuration
this.recoveryEnabled = has(config, "recoveryEnabled")
@@ -716,15 +730,6 @@ export class Parser {
this._errors = newErrors
}

public set input(newInput: IToken[]) {
this.reset()
this._input = newInput
}

public get input(): IToken[] {
return cloneArr(this._input)
}

/**
* Resets the parser state, should be overridden for custom parsers which "carry" additional state.
* When overriding, remember to also invoke the super implementation!
@@ -734,7 +739,6 @@ export class Parser {

this.isBackTrackingStack = []
this.errors = []
this._input = []
this.RULE_STACK = []
this.LAST_EXPLICIT_RULE_STACK = []
this.CST_STACK = []
@@ -899,19 +903,6 @@ export class Parser {
}
}

// skips a token and returns the next token
protected SKIP_TOKEN(): IToken {
// example: assume 45 tokens in the input. If the input index is 44, NEXT_TOKEN would return
// input[45], the 46th item, which does not exist,
// so the largest valid input index is 43 (input.length - 2)
if (this.inputIdx <= this._input.length - 2) {
this.consumeToken()
return this.LA(1)
} else {
return END_OF_FILE
}
}

// Parsing DSL
/**
* Convenience method equivalent to CONSUME1.
@@ -1830,57 +1821,14 @@ export class Parser {
return consumedToken
}

/**
* Convenience method equivalent to LA(1)
* It is no longer used directly in chevrotain due to
* performance considerations (avoid the need for inlining optimizations).
*
* But it is maintained for backward compatibility reasons.
*
* @deprecated
*/
protected NEXT_TOKEN(): IToken {
return this.LA(1)
}

// Lexer (accessing Token vector) related methods which can be overridden to implement lazy lexers
// or lexers dependent on parser context.
protected LA(howMuch: number): IToken {
if (this._input.length <= this.inputIdx + howMuch) {
return END_OF_FILE
} else {
return this._input[this.inputIdx + howMuch]
}
}

protected consumeToken() {
this.inputIdx++
}

protected saveLexerState() {
this.savedTokenIdx = this.inputIdx
}

protected restoreLexerState() {
this.inputIdx = this.savedTokenIdx
}

protected resetLexerState(): void {
this.inputIdx = -1
}

protected moveLexerStateToEnd(): void {
this.inputIdx = this.input.length - 1
}

// other functionality
private saveRecogState(): IParserState {
// errors is a getter which will clone the errors array
let savedErrors = this.errors
let savedRuleStack = cloneArr(this.RULE_STACK)
return {
errors: savedErrors,
lexerState: this.inputIdx,
lexerState: this.exportLexerState(),
RULE_STACK: savedRuleStack,
CST_STACK: this.CST_STACK,
LAST_EXPLICIT_RULE_STACK: this.LAST_EXPLICIT_RULE_STACK
@@ -1889,7 +1837,7 @@ export class Parser {

private reloadRecogState(newState: IParserState) {
this.errors = newState.errors
this.inputIdx = newState.lexerState
this.importLexerState(newState.lexerState)
this.RULE_STACK = newState.RULE_STACK
}

@@ -1978,7 +1926,7 @@ export class Parser {
}
} else if (isFirstInvokedRule) {
// otherwise a Redundant input error will be created as well and we cannot guarantee that this is indeed the case
this.moveLexerStateToEnd()
this.moveToTerminatedState()
// the parser should never throw one of its own errors outside its flow.
// even if error recovery is disabled
return recoveryValueFunc()
@@ -3221,6 +3169,80 @@ export class Parser {
ruleCstResult
)
}

// lexer related methods
public set input(newInput: IToken[]) {
this.reset()
this.tokVector = newInput
this.tokVectorLength = newInput.length
}

public get input(): IToken[] {
return this.tokVector
}

// skips a token and returns the next token
protected SKIP_TOKEN(): IToken {
if (this.currIdx <= this.tokVector.length - 2) {
this.consumeToken()
return this.LA(1)
} else {
return END_OF_FILE
}
}

/**
* Convenience method equivalent to LA(1)
* It is no longer used directly in chevrotain due to
* performance considerations (avoid the need for inlining optimizations).
*
* But it is maintained for backward compatibility reasons.
*
* @deprecated
*/
protected NEXT_TOKEN(): IToken {
return this.LA(1)
}

// Lexer (accessing Token vector) related methods which can be overridden to implement lazy lexers
// or lexers dependent on parser context.
protected LA(howMuch: number): IToken {
// TODO: does this optimization (caching tokVectorLength) provide a measurable benefit?
if (this.tokVectorLength <= this.currIdx + howMuch) {
return END_OF_FILE
} else {
return this.tokVector[this.currIdx + howMuch]
}
}

protected consumeToken() {
this.currIdx++
}

protected exportLexerState(): number {
return this.currIdx
}

protected importLexerState(newState: number) {
this.currIdx = newState
}

// TODO: use export/import to describe save/restore?
protected saveLexerState() {
this.savedLexerState = this.currIdx
}

protected restoreLexerState() {
this.currIdx = this.savedLexerState
}

protected resetLexerState(): void {
this.currIdx = -1
}

protected moveToTerminatedState(): void {
this.currIdx = this.tokVector.length - 1
}
}

function InRuleRecoveryException(message: string) {
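Taken together, the parser_public.ts changes funnel all token-vector access through a small overridable surface: LA, consumeToken, exportLexerState / importLexerState, the save/restore/reset lexer-state hooks, and moveToTerminatedState. As a minimal sketch of a custom adapter built on these hooks, assuming a pull-based token source (the class name, iterator source, and vocabulary typing below are hypothetical, not part of this commit):

    import { IToken } from "../scan/tokens_public"
    import { END_OF_FILE, Parser } from "./parser_public"

    class LazyTokenParser extends Parser {
        private buffered: IToken[] = []
        private bufferedIdx: number = -1
        private savedBufferedIdx: number = -1

        constructor(private tokenSource: Iterator<IToken>, vocabulary: any) {
            super([], vocabulary)
        }

        // pull tokens from the source only when the parser actually looks ahead
        protected LA(howMuch: number): IToken {
            while (this.buffered.length <= this.bufferedIdx + howMuch) {
                const next = this.tokenSource.next()
                if (next.done) {
                    return END_OF_FILE
                }
                this.buffered.push(next.value)
            }
            return this.buffered[this.bufferedIdx + howMuch]
        }

        protected consumeToken() {
            this.bufferedIdx++
        }

        // a single number snapshots the position, which is all that
        // backtracking and error recovery need to roll back
        protected exportLexerState(): number {
            return this.bufferedIdx
        }

        protected importLexerState(newState: number) {
            this.bufferedIdx = newState
        }

        protected saveLexerState() {
            this.savedBufferedIdx = this.bufferedIdx
        }

        protected restoreLexerState() {
            this.bufferedIdx = this.savedBufferedIdx
        }

        protected resetLexerState(): void {
            this.bufferedIdx = -1
        }

        protected moveToTerminatedState(): void {
            this.bufferedIdx = this.buffered.length - 1
        }
    }

Note that saveLexerState / restoreLexerState are overridden as well: the base implementations read and write currIdx directly rather than going through exportLexerState / importLexerState, which is likely what the save/restore TODO in the diff above is pointing at.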
105 changes: 105 additions & 0 deletions test/full_flow/ecma_quirks/ecma_quirks.ts
@@ -0,0 +1,105 @@
// Using TypeScript we have both classes and static properties to define Tokens

import { createToken, createTokenInstance, IToken, Token } from "../../../src/scan/tokens_public"
import { Lexer } from "../../../src/scan/lexer_public"
import { END_OF_FILE, Parser } from "../../../src/parse/parser_public"

const Return = createToken({
name: "Return",
pattern: /return/y
})

const DivisionOperator = createToken({
name: "DivisionOperator",
pattern: /\//y
})

const RegExpLiteral = createToken({
name: "RegExpLiteral",
pattern: /\/\d+\//y
})

const NumberLiteral = createToken({
name: "NumberLiteral",
pattern: /\d+/y
})

// TODO: differentiate line terminators from other whitespace?
const WhiteSpace = createToken({
name: "WhiteSpace",
pattern: /\s+/y,
group: Lexer.SKIPPED,
line_breaks: true
})

const Semicolon = createToken({
name: "Semicolon",
pattern: /;/y
})

// the full vocabulary; note RegExpLiteral is listed before DivisionOperator
// so that on-demand lexing attempts the longer pattern first
const allTokens = [
    WhiteSpace,
    NumberLiteral,
    Return,
    RegExpLiteral,
    DivisionOperator,
    Semicolon
]

class EcmaScriptQuirksParser extends Parser {
constructor(input: Token[]) {
super(input, allTokens)
Parser.performSelfAnalysis(this)
}

public statement = this.RULE("statement", () => {
this.CONSUME(Return)
this.SUBRULE(this.expression)
this.CONSUME(Semicolon)
})

public expression = this.RULE("expression", () => {
this.SUBRULE(this.atomic)
this.MANY(() => {
this.CONSUME(DivisionOperator)
this.SUBRULE2(this.atomic)
})
})

public atomic = this.RULE("atomic", () => {
this.OR([
{ ALT: () => this.CONSUME(RegExpLiteral) },
{ ALT: () => this.CONSUME(NumberLiteral) }
])
})

private orgText: string
private textIdx: number

// lexer related methods
public set textInput(newInput: string) {
this.reset()
this.orgText = newInput
}

public get textInput(): string {
return this.orgText
}

protected resetLexerState(): void {
this.textIdx = 0
}

protected LA(howMuch: number): IToken {
    // peek howMuch tokens ahead without consuming any of them,
    // lexing lazily from the original text (see nextTokenAt below)
    let next = { token: END_OF_FILE, endIdx: this.textIdx }
    for (let i = 0; i < howMuch; i++) {
        next = this.nextTokenAt(next.endIdx)
    }
    return next.token
}
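
// NOTE: everything below is a hypothetical completion sketch, NOT part of
// the original WIP commit. nextTokenAt lexes a single token on demand:
// it tries each sticky (/y) pattern at startIdx, skips WhiteSpace, and
// returns the first match together with the index just past it. There is
// no caching and no lexer-error reporting, and the context-dependent
// regExp-vs-division quirk this test is named after remains unresolved.
private nextTokenAt(startIdx: number): { token: IToken; endIdx: number } {
    let idx = startIdx
    while (idx < this.orgText.length) {
        let skippedWhiteSpace = false
        for (const tokType of allTokens) {
            const pattern: RegExp = (<any>tokType).PATTERN
            pattern.lastIndex = idx
            const match = pattern.exec(this.orgText)
            if (match !== null) {
                idx = pattern.lastIndex
                if (tokType === WhiteSpace) {
                    skippedWhiteSpace = true
                    break // SKIPPED group --> scan again from the new index
                }
                return {
                    token: createTokenInstance(
                        tokType,
                        match[0],
                        NaN,
                        NaN,
                        NaN,
                        NaN,
                        NaN,
                        NaN
                    ),
                    endIdx: idx
                }
            }
        }
        if (!skippedWhiteSpace) {
            break // nothing matched --> treat as end of input for now
        }
    }
    return { token: END_OF_FILE, endIdx: idx }
}

// consuming advances the text index past the token LA(1) would return
protected consumeToken() {
    this.textIdx = this.nextTokenAt(this.textIdx).endIdx
}

// keep the snapshot hooks consistent with textIdx; the base class
// implementations operate on its own currIdx instead
private savedTextIdx: number

protected exportLexerState(): number {
    return this.textIdx
}

protected importLexerState(newState: number) {
    this.textIdx = newState
}

protected saveLexerState() {
    this.savedTextIdx = this.textIdx
}

protected restoreLexerState() {
    this.textIdx = this.savedTextIdx
}

protected moveToTerminatedState(): void {
    this.textIdx = this.orgText.length
}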
}

// reuse the same parser instance.
const parser = new EcmaScriptQuirksParser([])

export function parse(text: string): any {
    parser.textInput = text
    let value = parser.statement()

    return {
        value: value,
        parseErrors: parser.errors
    }
}
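
A hypothetical spec-style usage of the above (the expected outcome assumes the completion sketch in EcmaScriptQuirksParser):

    const result = parse("return /123/ ;")
    // the sticky patterns lex "/123/" as a RegExpLiteral here,
    // so parseErrors is expected to be empty
    console.log(result.parseErrors)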
Empty file.
