diff --git a/.gitignore b/.gitignore
index 7c13750..04f58c0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,6 @@ go.work
 presentations/**/node_modules
 presentations/**/dist
+
+# IntelliJ Editor files
+.idea/
\ No newline at end of file
diff --git a/src/ast/ast.go b/src/ast/ast.go
new file mode 100644
index 0000000..14ae354
--- /dev/null
+++ b/src/ast/ast.go
@@ -0,0 +1,104 @@
+/*
+Package ast provides functionality to represent YARTBML Programs as an
+Abstract Syntax Tree (Parse Tree).
+
+Programs in YARTBML are a series of statements.
+
+A fully valid program written in YARTBML is the following:
+
+    let x = 10;
+    let y = 15;
+
+    let add = fn(a, b) {
+        return a + b;
+    }
+
+We can see three statements, three variable bindings (let statements) of the following form:
+
+    let <identifier> = <expression>;
+
+A let statement consists of two changing parts: an identifier and an expression.
+In the example above, x, y, and add are identifiers; 10, 15, and the function literal are expressions.
+
+The difference between an expression and a statement is the following: expressions produce values and statements don't.
+A `return 5;` statement doesn't produce a value, but `add(5, 5)` does.
+
+We will be using this AST (of statements and expressions) and applying Pratt parsing for our language.
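+
+As an illustrative sketch (hand-written here, not actual parser output), the
+statement `let x = 10;` above corresponds to a [LetStatement] node roughly like
+the following; the Value expression node is elided because expression parsing
+is still a TODO in the parser:
+
+    &LetStatement{
+        Token: token.Token{Type: token.LET, Literal: "let"},
+        Name:  &Identifier{Token: token.Token{Type: token.IDENT, Literal: "x"}, Value: "x"},
+        Value: nil, // e.g. an integer-literal Expression node, once implemented
+    }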
+*/
+package ast
+
+import (
+    "YARTBML/token"
+)
+
+// Nodes are going to contain our language's constructs:
+// "Expression(s)" or "Statement(s)". Each node will be used
+// to build our AST (Abstract Syntax Tree), aka Parse Tree.
+// Every node will provide the literal value of the token
+// it is associated with. The method itself will be used solely
+// for debugging purposes.
+type Node interface {
+    TokenLiteral() string
+}
+
+// Statements don't produce values; a Statement node represents a
+// construct, such as a binding, that doesn't explicitly return a value.
+type Statement interface {
+    Node
+    statementNode()
+}
+
+// Expressions produce values that should be handled.
+type Expression interface {
+    Node
+    expressionNode()
+}
+
+// Our programs are a series of statements.
+// This is the root node of our AST.
+type Program struct {
+    Statements []Statement
+}
+
+func (p *Program) TokenLiteral() string {
+    if len(p.Statements) > 0 {
+        return p.Statements[0].TokenLiteral()
+    } else {
+        return ""
+    }
+}
+
+// Represents a Let "Statement" within our AST to indicate an identifier
+// that holds a value. A Let Statement has `Name` to hold the identifier
+// of the binding and `Value` for the expression that produces the value.
+type LetStatement struct {
+    Token token.Token // token.LET token
+    Name  *Identifier
+    Value Expression
+}
+
+// Implementing the Statement interface on LetStatement
+func (ls *LetStatement) statementNode() {}
+
+// Implementing the Node interface on LetStatement
+func (ls *LetStatement) TokenLiteral() string {
+    return ls.Token.Literal
+}
+
+// Holds the identifier of the binding in the [LetStatement]:
+// the x in `let x = 5;`. Its Value is the name of the
+// identifier in the [LetStatement].
+type Identifier struct {
+    Token token.Token // token.IDENT token
+    Value string
+}
+
+// Implementing the Expression interface on an Identifier, as when the
+// identifier is referenced in other parts of a program, it
+// will produce a value.
+func (i *Identifier) expressionNode() {}
+
+// Implementing the Node interface on the Identifier expression
+func (i *Identifier) TokenLiteral() string {
+    return i.Token.Literal
+}
diff --git a/src/lexer/lexer.go b/src/lexer/lexer.go
index c8c818b..171ad2b 100644
--- a/src/lexer/lexer.go
+++ b/src/lexer/lexer.go
@@ -1,3 +1,7 @@
+// Package lexer provides functionality to tokenize input strings into tokens in the YARTBML Programming Language.
+// The lexer (lexical analyzer) reads the input string character by character, identifying tokens such as identifiers,
+// keywords, operators, and literals, and creating corresponding tokens.
+// Each token has a type and a literal value associated with it.
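+//
+// A minimal usage sketch (lexer.New, NextToken, and token.EOF are defined in this
+// project; fmt is assumed from the standard library):
+//
+//	l := lexer.New("let x = 5;")
+//	for tok := l.NextToken(); tok.Type != token.EOF; tok = l.NextToken() {
+//		fmt.Println(tok.Type, tok.Literal)
+//	}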
 package lexer
 
 import "YARTBML/token"
@@ -9,12 +13,15 @@ type Lexer struct {
     ch           byte // current char under examination
 }
 
+// Initializes a new Lexer with the given program contents as a string input.
 func New(input string) *Lexer {
     l := &Lexer{input: input}
     l.readChar()
     return l
 }
 
+// Reads the next character from the input string
+// and advances the lexer's position.
 func (l *Lexer) readChar() {
     if l.readPosition >= len(l.input) {
         l.ch = 0
@@ -25,10 +32,12 @@ func (l *Lexer) readChar() {
     l.readPosition += 1
 }
 
+// Creates a new token with the given `TokenType` and character.
 func newToken(tokenType token.TokenType, ch byte) token.Token {
     return token.Token{Type: tokenType, Literal: string(ch)}
 }
 
+// Returns the next token from the input string (program contents).
 func (l *Lexer) NextToken() token.Token {
     var tok token.Token
     l.skipWhitespace()
@@ -95,6 +104,9 @@ func (l *Lexer) NextToken() token.Token {
     return tok
 }
 
+// When a series of letters is encountered, the assumption
+// is that, unless it is a keyword, it is read as an
+// identifier token.
 func (l *Lexer) readIdentifier() string {
     position := l.position
     for isLetter(l.ch) {
@@ -103,6 +115,9 @@ func (l *Lexer) readIdentifier() string {
     return l.input[position:l.position]
 }
 
+// When a series of digits is encountered, the assumption
+// is that a number literal has been encountered, so the
+// digits are read and returned as a single literal.
 func (l *Lexer) readNumber() string {
     position := l.position
     for isDigit(l.ch) {
@@ -111,20 +126,25 @@ func (l *Lexer) readNumber() string {
     return l.input[position:l.position]
 }
 
+// Verifies if a given character is within this regex: [a-zA-Z_]
 func isLetter(ch byte) bool {
     return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_'
 }
 
+// Verifies if a given character is within this regex: [0-9]
 func isDigit(ch byte) bool {
     return '0' <= ch && ch <= '9'
 }
 
+// Consumes whitespace (including carriage returns and line feeds),
+// as our language isn't whitespace sensitive.
 func (l *Lexer) skipWhitespace() {
     for l.ch == ' ' || l.ch == '\t' || l.ch == '\n' || l.ch == '\r' {
         l.readChar()
     }
 }
 
+// Returns the next character in the input string without advancing the lexer.
 func (l *Lexer) peekChar() byte {
     if l.readPosition >= len(l.input) {
         return 0
@@ -132,4 +152,3 @@ func (l *Lexer) peekChar() byte {
         return l.input[l.readPosition]
     }
 }
-
diff --git a/src/parser/parser.go b/src/parser/parser.go
new file mode 100644
index 0000000..5525550
--- /dev/null
+++ b/src/parser/parser.go
@@ -0,0 +1,136 @@
+// Package parser provides functionality to parse tokens into an abstract syntax tree (AST) in the YARTBML Programming Language.
+// The parser analyzes tokens generated by the lexer and constructs an AST representing the program's structure.
+// It defines grammar rules and recursively traverses the token stream to build the AST nodes.
+// The implementation is a Top-Down Operator Precedence Parser (Pratt Parser).
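+//
+// A minimal usage sketch (New, ParseProgram, and Errors are defined below;
+// lexer is this project's lexer package):
+//
+//	l := lexer.New("let x = 5;")
+//	p := parser.New(l)
+//	program := p.ParseProgram()
+//	if len(p.Errors()) > 0 {
+//		// handle parse errors before using program
+//	}
+//	_ = program // *ast.Program holding the parsed statements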
+package parser
+
+import (
+    "YARTBML/ast"
+    "YARTBML/lexer"
+    "YARTBML/token"
+    "fmt"
+)
+
+// A Parser consumes tokens received from the lexer and
+// stores errors in a string slice as they are spotted
+// in the provided YARTBML program (UTF-8 string).
+type Parser struct {
+    l      *lexer.Lexer // Lexer instance for tokenization
+    errors []string     // Parsing errors encountered
+
+    curToken  token.Token // Current token being parsed
+    peekToken token.Token // Next token to be parsed
+}
+
+// Creates a new instance of the Parser with a given Lexer.
+func New(l *lexer.Lexer) *Parser {
+    p := &Parser{
+        l:      l,
+        errors: []string{},
+    }
+
+    // read two tokens, so curToken and peekToken are both set;
+    // acts exactly like the lexer's position and readPosition (for lookaheads)
+    p.nextToken()
+    p.nextToken()
+
+    return p
+}
+
+// Advances the parser to the next token.
+func (p *Parser) nextToken() {
+    p.curToken = p.peekToken
+    p.peekToken = p.l.NextToken()
+}
+
+// Parses the entire program and constructs the AST.
+// Iterates over every token in the input until an EOF token is encountered.
+// Since our programs are a series of statements, it attempts to parse every statement in sequence.
+func (p *Parser) ParseProgram() *ast.Program {
+    program := &ast.Program{}
+    program.Statements = []ast.Statement{}
+
+    for p.curToken.Type != token.EOF {
+        stmt := p.parseStatement()
+        if stmt != nil {
+            program.Statements = append(program.Statements, stmt)
+        }
+        p.nextToken()
+    }
+
+    return program
+}
+
+// Parses a single statement, creating a statement node and
+// child expression nodes based on the type of statement
+// encountered.
+func (p *Parser) parseStatement() ast.Statement {
+    switch p.curToken.Type {
+    case token.LET:
+        return p.parseLetStatement()
+    default:
+        return nil
+    }
+}
+
+// Parses a let statement down to a Name (Identifier) node and a Value (Expression) node.
+func (p *Parser) parseLetStatement() *ast.LetStatement {
+    // Construct LetStatement Node
+    stmt := &ast.LetStatement{Token: p.curToken}
+
+    if !p.expectPeek(token.IDENT) {
+        return nil
+    }
+
+    // Construct Identifier Node: IDENT token & Name of Identifier as Value
+    stmt.Name = &ast.Identifier{Token: p.curToken, Value: p.curToken.Literal}
+
+    if !p.expectPeek(token.ASSIGN) {
+        return nil
+    }
+
+    // TODO: Skipping expressions until we encounter a semicolon
+    // TODO: Construct Expression
+    for !p.curTokenIs(token.SEMICOLON) {
+        p.nextToken()
+    }
+
+    return stmt
+}
+
+// Checks if the curToken's TokenType matches the given TokenType.
+func (p *Parser) curTokenIs(t token.TokenType) bool {
+    return p.curToken.Type == t
+}
+
+// Checks if the peekToken's (next token's) TokenType matches the given TokenType.
+func (p *Parser) peekTokenIs(t token.TokenType) bool {
+    return p.peekToken.Type == t
+}
+
+// Checks if the next token is of the given TokenType. Essentially, a lookahead by one
+// in order to confirm the next token. If the next token is not the expected one, then
+// we generate an error and append it to the errors array of the
+// current [Parser] instance.
+func (p *Parser) expectPeek(t token.TokenType) bool {
+    if p.peekTokenIs(t) {
+        p.nextToken()
+        return true
+    } else {
+        p.peekError(t)
+        return false
+    }
+}
+
+// Returns all parsing errors encountered.
+func (p *Parser) Errors() []string {
+    return p.errors
+}
+
+// Appends to the errors property of the Parser instance when the next token
+// is not what is expected.
+func (p *Parser) peekError(t token.TokenType) {
+    msg := fmt.Sprintf("expected next token to be %s, got %s instead",
+        t, p.peekToken.Type)
+    p.errors = append(p.errors, msg)
+}
diff --git a/src/parser/parser_test.go b/src/parser/parser_test.go
new file mode 100644
index 0000000..21ad649
--- /dev/null
+++ b/src/parser/parser_test.go
@@ -0,0 +1,86 @@
+package parser
+
+// TODO: Mock / Stub out the Lexer
+
+import (
+    "YARTBML/ast"
+    "YARTBML/lexer"
+    "testing"
+)
+
+func TestLetStatements(t *testing.T) {
+    input := `
+let x = 5;
+let y = 10;
+let foobar = 838383;
+`
+    l := lexer.New(input)
+    p := New(l)
+
+    program := p.ParseProgram()
+    checkParserErrors(t, p)
+
+    if program == nil {
+        t.Fatalf("ParseProgram() returned nil")
+    }
+
+    if len(program.Statements) != 3 {
+        t.Fatalf("program.Statements does not contain 3 statements. got=%d",
+            len(program.Statements))
+    }
+
+    tests := []struct {
+        expectedIdentifier string
+    }{
+        {"x"},
+        {"y"},
+        {"foobar"},
+    }
+
+    for i, tt := range tests {
+        stmt := program.Statements[i]
+        if !testLetStatement(t, stmt, tt.expectedIdentifier) {
+            return
+        }
+    }
+}
+
+func checkParserErrors(t *testing.T, p *Parser) {
+    errors := p.Errors()
+    if len(errors) == 0 {
+        return
+    }
+
+    t.Errorf("parser has %d errors", len(errors))
+    for _, msg := range errors {
+        t.Errorf("parser error: %q", msg)
+    }
+    t.FailNow()
+}
+
+func testLetStatement(t *testing.T, s ast.Statement, name string) bool {
+    if s.TokenLiteral() != "let" {
+        t.Errorf("s.TokenLiteral not 'let'. got=%q", s.TokenLiteral())
+        return false
+    }
+
+    letStmt, ok := s.(*ast.LetStatement)
+    if !ok {
+        t.Errorf("s not *ast.LetStatement. got=%T", s)
+        return false
+    }
+
+    if letStmt.Name.Value != name {
+        t.Errorf("letStmt.Name.Value not '%s'. got=%s",
+            name, letStmt.Name.Value)
+        return false
+    }
+
+    if letStmt.Name.TokenLiteral() != name {
+        t.Errorf("letStmt.Name.TokenLiteral() not '%s'. got=%s",
+            name, letStmt.Name.TokenLiteral())
+        return false
+    }
+
+    return true
+}
diff --git a/src/repl/repl.go b/src/repl/repl.go
index 819d221..877ba45 100644
--- a/src/repl/repl.go
+++ b/src/repl/repl.go
@@ -25,4 +25,3 @@ func Start(in io.Reader, out io.Writer) {
         }
     }
 }
-
diff --git a/src/token/token.go b/src/token/token.go
index 919ce8d..50a8535 100644
--- a/src/token/token.go
+++ b/src/token/token.go
@@ -1,13 +1,22 @@
+// Package token defines constants representing the lexical tokens of the YARTBML Programming Language.
+// These tokens are the smallest units in the language's syntax, such as identifiers, keywords, operators, and literals.
+// The lexer (lexical analyzer) of the YARTBML interpreter uses these tokens to tokenize the source code input.
+// YARTBML will only support UTF-8, and as such the Lexer and Tokens will only utilize literal values in UTF-8.
+// Each token has a type and a literal value associated with it, if applicable.
+// This package provides constants for all supported tokens and helper functions for working with them.
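+//
+// For illustration, the lexer turns the source `let x = 5;` into roughly the
+// following tokens (INT, the integer-literal type, is assumed from the part of
+// the const block not shown in this diff):
+//
+//	{Type: LET, Literal: "let"}
+//	{Type: IDENT, Literal: "x"}
+//	{Type: ASSIGN, Literal: "="}
+//	{Type: INT, Literal: "5"}
+//	{Type: SEMICOLON, Literal: ";"}
+//	{Type: EOF, Literal: ""}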
 package token
 
+// TokenType represents the type of a token.
 type TokenType string
 
+// Token holds the type and literal value of a token in UTF-8.
 type Token struct {
     Type    TokenType
     Literal string
 }
 
 // Symbolic names substituted at complile time for the assigned value
+// Each entry here is a valid token that is recognized in our language
 const (
     ILLEGAL = "ILLEGAL"
     EOF     = "EOF"
@@ -49,6 +58,7 @@ const (
     RETURN   = "RETURN"
 )
 
+// Keywords maps identifiers to their corresponding token types.
 var keywords = map[string]TokenType{
     "fn":  FUNCTION,
     "let": LET,
@@ -59,10 +69,10 @@ var keywords = map[string]TokenType{
     "return": RETURN,
 }
 
+// LookupIdent checks if the given identifier is a keyword. If it is, it returns the corresponding token type; otherwise, it returns IDENT.
 func LookupIdent(ident string) TokenType {
     if tok, ok := keywords[ident]; ok {
         return tok
     }
     return IDENT
 }
-
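+
+// Illustrative behavior of LookupIdent, following the keywords map above:
+//
+//	LookupIdent("let")    // LET
+//	LookupIdent("return") // RETURN
+//	LookupIdent("add")    // IDENT ("add" is not a keyword)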