feat: Parsing Let Statements (#18)
* Lexer tests and token struct

* fix: module naming and import resolution and start of lexer

Because we don't have our package published / set up to publish properly, we can't pull a package from the GitHub domain.

* Functions for lexer to read chars in strings. Added tests.

* Support for parsing numbers and eating white space

* Added more tokens to lexer. Created lookahead functionality.

* Implementation of REPL. Can run from command line.

* ast and parser packages

Created the ast package, which contains the base node, statement, and expression interfaces.
Also created the parser package, which imports the ast, lexer, and token packages and contains token functions, statement functions, peek functions, an error function, and the beginnings of the ParseProgram() function. Created parser_test.go as well.

* fix: Remove IntelliJ Editor Project config files

* style: apply go fmt

* feat(docs): inline docs to all existing packages

---------

Co-authored-by: Joseph Porrino <joeyporrino1998@gmail.com>
Co-authored-by: Dinesh Umasankar <dinesh71uma@gmail.com>
3 people authored Apr 7, 2024
1 parent c6580c4 commit 06ee0fa
Showing 7 changed files with 360 additions and 3 deletions.
3 changes: 3 additions & 0 deletions .gitignore
@@ -22,3 +22,6 @@ go.work

presentations/**/node_modules
presentations/**/dist

# IntelliJ Editor files
.idea/
104 changes: 104 additions & 0 deletions src/ast/ast.go
@@ -0,0 +1,104 @@
/*
Package ast provides functionality to represent YARTBML Programs as an
Abstract Syntax Tree (Parse Tree).
Programs in YARTBML are a series of statements.
A fully valid program written in YARTBML is the following:
	let x = 10;
	let y = 15;
	let add = fn(a, b) {
		return a + b;
	}
We can see three statements, three variable bindings: let statements of the following form:
	let <identifier> = <expression>;
A let statement consists of two changing parts: an identifier and an expression.
In the example above, x, y, and add are identifiers. 10, 15, and the function literal are expressions.
The difference between an expression and a statement is the following: expressions produce values and statements don't.
A `return 5;` statement doesn't produce a value, but add(5, 5) does.
We will be using this AST (of statements and expressions) and applying Pratt parsing for our language.
*/
package ast

import (
"YARTBML/token"
)

// Nodes are going to contain our language's construct of
// "Expression(s)" or "Statement(s)". Each node will be used
// to build our AST (Abstract Syntax Tree) aka Parse Tree.
// Every node will provide the literal value of the token
// it is associated with. The method itself will be used solely
// for debugging purposes.
type Node interface {
TokenLiteral() string
}

// Statements don't produce values; a Statement node represents a language
// construct that doesn't explicitly return a value.
type Statement interface {
Node
statementNode()
}

// Expressions produce values that should be handled.
type Expression interface {
Node
expressionNode()
}

// Our programs are a series of statements.
// This is the root node of our AST.
type Program struct {
Statements []Statement
}

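// Implementing the Node interface on Program. Returns the token literal of
// the first statement, primarily for debugging.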
func (p *Program) TokenLiteral() string {
if len(p.Statements) > 0 {
return p.Statements[0].TokenLiteral()
} else {
return ""
}
}

// Represents a Let "Statement" within our AST to indicate an identifier
// that holds a value. A Let Statement has `Name` to hold the identifier
// of the binding and `Value` for the expression that produces the value.
type LetStatement struct {
Token token.Token // token.LET token
Name *Identifier
Value Expression
}

// Implementing the Statement interface on LetStatement
func (ls *LetStatement) statementNode() {}

// Implementing the Node interface on LetStatement
func (ls *LetStatement) TokenLiteral() string {
return ls.Token.Literal
}

// Holds the identifier of the binding in a [LetStatement]:
// the x in `let x = 5;`. `Value` holds the name of the
// identifier.
type Identifier struct {
Token token.Token // token.IDENT token
Value string
}

// Implementing the Expression interface on Identifier, as when the
// identifier is referenced in other parts of a program, it
// will produce a value.
func (i *Identifier) expressionNode() {}

// Implementing the Node interface on Identifier
func (i *Identifier) TokenLiteral() string {
return i.Token.Literal
}
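To make the shape of these nodes concrete, here is a minimal hand-built sketch of `let x = 10;` (a hypothetical snippet, not part of this commit; it assumes the token.LET and token.IDENT constants referenced in the comments above):

	program := &ast.Program{
		Statements: []ast.Statement{
			&ast.LetStatement{
				Token: token.Token{Type: token.LET, Literal: "let"},
				Name: &ast.Identifier{
					Token: token.Token{Type: token.IDENT, Literal: "x"},
					Value: "x",
				},
				// Value is left nil here: expression parsing is still a TODO
				// in parseLetStatement (see parser.go below).
			},
		},
	}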
21 changes: 20 additions & 1 deletion src/lexer/lexer.go
@@ -1,3 +1,7 @@
// Package lexer provides functionality to tokenize input strings into tokens in the YARTBML Programming Language.
// The lexer (lexical analyzer) reads the input string character by character, identifying tokens such as identifiers,
// keywords, operators, and literals, and creating corresponding tokens.
// Each token has a type and a literal value associated with it.
package lexer

import "YARTBML/token"
@@ -9,12 +13,15 @@ type Lexer struct {
ch byte // current char under examination
}

// Initialize a new Lexer with the given program contents as a string input.
func New(input string) *Lexer {
l := &Lexer{input: input}
l.readChar()
return l
}

// Reads the next character from the input string
// and advances the lexer's position.
func (l *Lexer) readChar() {
if l.readPosition >= len(l.input) {
l.ch = 0
@@ -25,10 +32,12 @@ func (l *Lexer) readChar() {
l.readPosition += 1
}

// Create a new token with the given `TokenType` and character.
func newToken(tokenType token.TokenType, ch byte) token.Token {
return token.Token{Type: tokenType, Literal: string(ch)}
}

// Returns the NextToken from the input string (program contents).
func (l *Lexer) NextToken() token.Token {
var tok token.Token
l.skipWhitespace()
@@ -95,6 +104,9 @@ func (l *Lexer) NextToken() token.Token {
return tok
}

// When a series of letters is encountered, it is read as a
// keyword if it matches one; otherwise it is read as an
// identifier token.
func (l *Lexer) readIdentifier() string {
position := l.position
for isLetter(l.ch) {
@@ -103,6 +115,9 @@ func (l *Lexer) readIdentifier() string {
return l.input[position:l.position]
}

// When a series of digits is encountered, it is read as a
// number literal and returned.
func (l *Lexer) readNumber() string {
position := l.position
for isDigit(l.ch) {
@@ -111,25 +126,29 @@ func (l *Lexer) readNumber() string {
return l.input[position:l.position]
}

// Verifies if a given character is within this regex: [a-zA-Z_]
func isLetter(ch byte) bool {
return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_'
}

// Verifies if a given character is within this regex: [0-9]
func isDigit(ch byte) bool {
return '0' <= ch && ch <= '9'
}

// Consumes whitespace (including carriage returns and line feeds),
// as our language isn't whitespace-sensitive.
func (l *Lexer) skipWhitespace() {
for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
l.readChar()
}
}

// Returns the next character in the input string without advancing the lexer.
func (l *Lexer) peekChar() byte {
if l.readPosition >= len(l.input) {
return 0
} else {
return l.input[l.readPosition]
}
}
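A minimal usage sketch of the lexer (hypothetical, not part of this commit; aside from token.EOF, which ParseProgram below relies on, the exact TokenType names are assumptions):

	l := lexer.New("let x = 10;")
	for tok := l.NextToken(); tok.Type != token.EOF; tok = l.NextToken() {
		// One token per iteration: the let keyword, the identifier x,
		// the assignment operator, the number literal 10, and the semicolon.
		fmt.Printf("%v %q\n", tok.Type, tok.Literal)
	}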

136 changes: 136 additions & 0 deletions src/parser/parser.go
@@ -0,0 +1,136 @@
// Package parser provides functionality to parse tokens into an abstract syntax tree (AST) in the YARTBML Programming Language.
// The parser analyzes tokens generated by the lexer and constructs an AST representing the program's structure.
// It defines grammar rules and recursively traverses the token stream to build the AST nodes.
// The implementation is a Top-Down Operator Precedence Parser (Pratt Parser).
package parser

import (
"YARTBML/ast"
"YARTBML/lexer"
"YARTBML/token"
"fmt"
)

// Parser consumes tokens received from the lexer and
// stores errors in a string slice as they are spotted
// in the provided YARTBML program (a UTF-8 string).
type Parser struct {
l *lexer.Lexer // Lexer instance for tokenization
errors []string // Parsing errors encountered

curToken token.Token // Current token being parsed
peekToken token.Token // Next token to be parsed
}

// Creates a new instance of the Parser with a given Lexer.
func New(l *lexer.Lexer) *Parser {
p := &Parser{
l: l,
errors: []string{},
}

// read two tokens, so curToken and peekToken are both set
// acts exactly like lexer's position and readPosition (for lookaheads)
p.nextToken()
p.nextToken()

return p
}

// Advances the parser to the next token.
func (p *Parser) nextToken() {
p.curToken = p.peekToken
p.peekToken = p.l.NextToken()
}

// Parses the entire program and constructs the AST.
// Iterates over every token in the input until an EOF token is encountered.
// Since our programs are a series of statements, it attempts to parse each statement in sequence.
func (p *Parser) ParseProgram() *ast.Program {
program := &ast.Program{}
program.Statements = []ast.Statement{}

for p.curToken.Type != token.EOF {
stmt := p.parseStatement()
if stmt != nil {
program.Statements = append(program.Statements, stmt)
}
p.nextToken()
}

return program
}

// Parses each statement, creating a statement node and
// child expression nodes based on the type of statement
// encountered.
func (p *Parser) parseStatement() ast.Statement {
switch p.curToken.Type {
case token.LET:
return p.parseLetStatement()
default:
return nil
}
}

// Parse Let Statements down to Name-Identifier Node and Value-Expression Node
func (p *Parser) parseLetStatement() *ast.LetStatement {
// Construct LetStatement Node
stmt := &ast.LetStatement{Token: p.curToken}

if !p.expectPeek(token.IDENT) {
return nil
}

// Construct Identifier Node: IDENT token & Name of Identifier as Value
stmt.Name = &ast.Identifier{Token: p.curToken, Value: p.curToken.Literal}

if !p.expectPeek(token.ASSIGN) {
return nil
}

// TODO: Skipping expressions until we encounter a semicolon
// TODO: Construct Expression
for !p.curTokenIs(token.SEMICOLON) {
p.nextToken()
}

return stmt
}

// Checks if the curToken's TokenType matches the given TokenType
func (p *Parser) curTokenIs(t token.TokenType) bool {
return p.curToken.Type == t
}

// Checks if the peekToken's (nextToken) TokenType matches given TokenType
func (p *Parser) peekTokenIs(t token.TokenType) bool {
return p.peekToken.Type == t
}

// Checks if the peekToken is the given TokenType and advances to it if so.
// Essentially a lookahead by one to confirm the next token. If the next token
// is not the expected type, we generate an error and append it to the errors
// array of the current [Parser] instance.
func (p *Parser) expectPeek(t token.TokenType) bool {
if p.peekTokenIs(t) {
p.nextToken()
return true
} else {
p.peekError(t)
return false
}
}

// Returns all parsing errors encountered
func (p *Parser) Errors() []string {
return p.errors
}

// Appends to the errors property of the Parser instance when the next token
// is not what is expected.
func (p *Parser) peekError(t token.TokenType) {
msg := fmt.Sprintf("expected next token to be %s, got %s instead",
t, p.peekToken.Type)
p.errors = append(p.errors, msg)
}
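Putting the pieces together, a minimal end-to-end sketch (hypothetical usage, mirroring what a parser test would do with the API in this commit):

	input := "let x = 10;"
	p := parser.New(lexer.New(input))
	program := p.ParseProgram()

	// Any failed expectPeek lands in Errors(); e.g. for the malformed input
	// "let x 10;" a message like "expected next token to be ..." is recorded.
	for _, msg := range p.Errors() {
		fmt.Println("parser error:", msg)
	}

	// With expression parsing still a TODO, each parsed let statement has its
	// Name set ("x" here) while its Value is left unparsed.
	fmt.Println(len(program.Statements)) // 1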