feat: Parsing Let Statements (#18)
* Lexer tests and token struct

* fix: module naming and import resolution and start of lexer

Because we don't have our package published / set up to publish properly, we can't pull a package from the GitHub domain.

* Functions for lexer to read chars in strings. Added tests.

* Support for parsing numbers and eating white space

* Added more tokens to lexer. Created lookahead functionality.

* Implementation of REPL. Can run from command line.

* ast and parser packages

Created the ast package, which contains the base node, statement, and expression interfaces.
Also created the parser package, which imports the ast, lexer, and token packages and contains token functions, statement functions, peek functions, an error function, and the beginnings of the ParseProgram() function. Created parser_test.go as well.

* fix: Remove IntelliJ Editor Project config files

* style: apply go fmt

* feat(docs): inline docs to all existing packages

---------

Co-authored-by: Joseph Porrino <joeyporrino1998@gmail.com>
Co-authored-by: Dinesh Umasankar <dinesh71uma@gmail.com>
3 people authored Apr 7, 2024
1 parent c6580c4 commit 06ee0fa
Showing 7 changed files with 360 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -22,3 +22,6 @@ go.work

presentations/**/node_modules
presentations/**/dist

# IntelliJ Editor files
.idea/
104 changes: 104 additions & 0 deletions src/ast/ast.go
@@ -0,0 +1,104 @@
/*
Package ast provides functionality to represent YARTBML Programs as an
Abstract Syntax Tree (Parse Tree).
Programs in YARTBML are a series of statements.
A fully valid program written in YARTBML is the following:
	let x = 10;
	let y = 15;
	let add = fn(a, b) {
		return a + b;
	}
We can see three statements, three variable bindings: let statements of the following form:
	let <identifier> = <expression>;
A let statement consists of two changing parts: an identifier and an expression.
In the example above, x, y, and add are identifiers. 10, 15, and the function literal are expressions.
The difference between an expression and a statement is the following: expressions produce values and statements don't.
A `return 5;` statement doesn't produce a value, but add(5, 5) does.
We will be using this AST (of statements and expressions) and applying Pratt parsing for our language.
*/
package ast

import (
"YARTBML/token"
)

// Nodes are going to contain our language's construct of
// "Expression(s)" or "Statement(s)". Each node will be used
// to build our AST (Abstract Syntax Tree) aka Parse Tree.
// Every node will provide the literal value of the token
// it is associated with. The method itself will be used solely
// for debugging purposes.
type Node interface {
TokenLiteral() string
}

// Statements don't produce values; a Statement node represents a language
// construct that doesn't explicitly return a value.
type Statement interface {
Node
statementNode()
}

// Expressions produce values that should be handled.
type Expression interface {
Node
expressionNode()
}

// Our programs are a series of statements.
// This is the root node of our AST.
type Program struct {
Statements []Statement
}

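// Implementing the Node interface on Program. Returns the token literal of
// the first statement, primarily for debugging.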
func (p *Program) TokenLiteral() string {
if len(p.Statements) > 0 {
return p.Statements[0].TokenLiteral()
} else {
return ""
}
}

// Represents a Let "Statement" within our AST to indicate an identifier
// that holds a value. A Let Statement has `Name` to hold the identifier
// of the binding and `Value` for the expression that produces the value.
type LetStatement struct {
Token token.Token // token.LET token
Name *Identifier
Value Expression
}

// Implementing the Statement interface on LetStatement
func (ls *LetStatement) statementNode() {}

// Implementing the Node interface on LetStatement
func (ls *LetStatement) TokenLiteral() string {
return ls.Token.Literal
}

// Holds the identifier of the binding in a [LetStatement]:
// the x in `let x = 5;`. `Value` holds the name of the
// identifier.
type Identifier struct {
Token token.Token // token.IDENT token
Value string
}

// Implementing the Expression interface on Identifier, as when the
// identifier is referenced in other parts of a program, it
// will produce a value.
func (i *Identifier) expressionNode() {}

// Implementing the Node interface on Identifier
func (i *Identifier) TokenLiteral() string {
return i.Token.Literal
}
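To make the shape of these nodes concrete, here is a minimal hand-built sketch of `let x = 10;` (a hypothetical snippet, not part of this commit; it assumes the token.LET and token.IDENT constants referenced in the comments above):

	program := &ast.Program{
		Statements: []ast.Statement{
			&ast.LetStatement{
				Token: token.Token{Type: token.LET, Literal: "let"},
				Name: &ast.Identifier{
					Token: token.Token{Type: token.IDENT, Literal: "x"},
					Value: "x",
				},
				// Value is left nil here: expression parsing is still a TODO
				// in parseLetStatement (see parser.go below).
			},
		},
	}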
21 changes: 20 additions & 1 deletion src/lexer/lexer.go
@@ -1,3 +1,7 @@
// Package lexer provides functionality to tokenize input strings into tokens in the YARTBML Programming Language.
// The lexer (lexical analyzer) reads the input string character by character, identifying tokens such as identifiers,
// keywords, operators, and literals, and creating corresponding tokens.
// Each token has a type and a literal value associated with it.
package lexer

import "YARTBML/token"
@@ -9,12 +13,15 @@ type Lexer struct {
ch byte // current char under examination
}

// Initialize a new Lexer with the given program contents as a string input.
func New(input string) *Lexer {
l := &Lexer{input: input}
l.readChar()
return l
}

// Reads the next character from the input string
// and advances the lexer's position.
func (l *Lexer) readChar() {
if l.readPosition >= len(l.input) {
l.ch = 0
@@ -25,10 +32,12 @@ func (l *Lexer) readChar() {
l.readPosition += 1
}

// Create a new token with the given `TokenType` and character.
func newToken(tokenType token.TokenType, ch byte) token.Token {
return token.Token{Type: tokenType, Literal: string(ch)}
}

// Returns the NextToken from the input string (program contents).
func (l *Lexer) NextToken() token.Token {
var tok token.Token
l.skipWhitespace()
@@ -95,6 +104,9 @@ func (l *Lexer) NextToken() token.Token {
return tok
}

// When a series of letters is encountered, it is read as a
// keyword if it matches one; otherwise it is read as an
// identifier token.
func (l *Lexer) readIdentifier() string {
position := l.position
for isLetter(l.ch) {
@@ -103,6 +115,9 @@ func (l *Lexer) readIdentifier() string {
return l.input[position:l.position]
}

// When a series of digits is encountered, it is read as a
// number literal and returned.
func (l *Lexer) readNumber() string {
position := l.position
for isDigit(l.ch) {
@@ -111,25 +126,29 @@ func (l *Lexer) readNumber() string {
return l.input[position:l.position]
}

// Verifies if a given character is within this regex: [a-zA-Z_]
func isLetter(ch byte) bool {
return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_'
}

// Verifies if a given character is within this regex: [0-9]
func isDigit(ch byte) bool {
return '0' <= ch && ch <= '9'
}

// Consumes whitespace (including carriage returns and line feeds),
// as our language isn't whitespace-sensitive.
func (l *Lexer) skipWhitespace() {
for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
l.readChar()
}
}

// Returns the next character in the input string without advancing the lexer.
func (l *Lexer) peekChar() byte {
if l.readPosition >= len(l.input) {
return 0
} else {
return l.input[l.readPosition]
}
}
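A minimal usage sketch of the lexer (hypothetical, not part of this commit; aside from token.EOF, which ParseProgram below relies on, the exact TokenType names are assumptions):

	l := lexer.New("let x = 10;")
	for tok := l.NextToken(); tok.Type != token.EOF; tok = l.NextToken() {
		// One token per iteration: the let keyword, the identifier x,
		// the assignment operator, the number literal 10, and the semicolon.
		fmt.Printf("%v %q\n", tok.Type, tok.Literal)
	}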

136 changes: 136 additions & 0 deletions src/parser/parser.go
@@ -0,0 +1,136 @@
// Package parser provides functionality to parse tokens into an abstract syntax tree (AST) in the YARTBML Programming Language.
// The parser analyzes tokens generated by the lexer and constructs an AST representing the program's structure.
// It defines grammar rules and recursively traverses the token stream to build the AST nodes.
// The implementation is a Top-Down Operator Precedence Parser (Pratt Parser).
package parser

import (
"YARTBML/ast"
"YARTBML/lexer"
"YARTBML/token"
"fmt"
)

// Parser consumes tokens received from the lexer and
// stores errors in a string slice as they are spotted
// in the provided YARTBML program (a UTF-8 string).
type Parser struct {
l *lexer.Lexer // Lexer instance for tokenization
errors []string // Parsing errors encountered

curToken token.Token // Current token being parsed
peekToken token.Token // Next token to be parsed
}

// Creates a new instance of the Parser with a given Lexer.
func New(l *lexer.Lexer) *Parser {
p := &Parser{
l: l,
errors: []string{},
}

// read two tokens, so curToken and peekToken are both set
// acts exactly like lexer's position and readPosition (for lookaheads)
p.nextToken()
p.nextToken()

return p
}

// Advances the parser to the next token.
func (p *Parser) nextToken() {
p.curToken = p.peekToken
p.peekToken = p.l.NextToken()
}

// Parses the entire program and constructs the AST.
// Iterates over every token in the input until an EOF token is encountered.
// Since our programs are a series of statements, it attempts to parse each statement in sequence.
func (p *Parser) ParseProgram() *ast.Program {
program := &ast.Program{}
program.Statements = []ast.Statement{}

for p.curToken.Type != token.EOF {
stmt := p.parseStatement()
if stmt != nil {
program.Statements = append(program.Statements, stmt)
}
p.nextToken()
}

return program
}

// Parses each statement, creating a statement node and
// child expression nodes based on the type of statement
// encountered.
func (p *Parser) parseStatement() ast.Statement {
switch p.curToken.Type {
case token.LET:
return p.parseLetStatement()
default:
return nil
}
}

// Parse Let Statements down to Name-Identifier Node and Value-Expression Node
func (p *Parser) parseLetStatement() *ast.LetStatement {
// Construct LetStatement Node
stmt := &ast.LetStatement{Token: p.curToken}

if !p.expectPeek(token.IDENT) {
return nil
}

// Construct Identifier Node: IDENT token & Name of Identifier as Value
stmt.Name = &ast.Identifier{Token: p.curToken, Value: p.curToken.Literal}

if !p.expectPeek(token.ASSIGN) {
return nil
}

// TODO: Skipping expressions until we encounter a semicolon
// TODO: Construct Expression
for !p.curTokenIs(token.SEMICOLON) {
p.nextToken()
}

return stmt
}

// Checks if the curToken's TokenType matches the given TokenType
func (p *Parser) curTokenIs(t token.TokenType) bool {
return p.curToken.Type == t
}

// Checks if the peekToken's (nextToken) TokenType matches given TokenType
func (p *Parser) peekTokenIs(t token.TokenType) bool {
return p.peekToken.Type == t
}

// Checks if the peekToken is the given TokenType and advances to it if so.
// Essentially a lookahead by one to confirm the next token. If the next token
// is not the expected type, we generate an error and append it to the errors
// array of the current [Parser] instance.
func (p *Parser) expectPeek(t token.TokenType) bool {
if p.peekTokenIs(t) {
p.nextToken()
return true
} else {
p.peekError(t)
return false
}
}

// Returns all parsing errors encountered
func (p *Parser) Errors() []string {
return p.errors
}

// Appends to the errors property of the Parser instance when the next token
// is not what is expected.
func (p *Parser) peekError(t token.TokenType) {
msg := fmt.Sprintf("expected next token to be %s, got %s instead",
t, p.peekToken.Type)
p.errors = append(p.errors, msg)
}
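Putting the pieces together, a minimal end-to-end sketch (hypothetical usage, mirroring what a parser test would do with the API in this commit):

	input := "let x = 10;"
	p := parser.New(lexer.New(input))
	program := p.ParseProgram()

	// Any failed expectPeek lands in Errors(); e.g. for the malformed input
	// "let x 10;" a message like "expected next token to be ..." is recorded.
	for _, msg := range p.Errors() {
		fmt.Println("parser error:", msg)
	}

	// With expression parsing still a TODO, each parsed let statement has its
	// Name set ("x" here) while its Value is left unparsed.
	fmt.Println(len(program.Statements)) // 1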