From b4f356242edbb1671338627079ffc3fce5d98f77 Mon Sep 17 00:00:00 2001 From: H1ghBre4k3r Date: Wed, 20 Sep 2023 21:11:45 +0200 Subject: [PATCH] feat(lexer): improve lexer structure --- src/lexer/lexmap.rs | 28 ++++ src/{lexer.rs => lexer/mod.rs} | 255 ++++----------------------------- src/lexer/token.rs | 143 ++++++++++++++++++ src/lexer/tokens.rs | 50 +++++++ 4 files changed, 247 insertions(+), 229 deletions(-) create mode 100644 src/lexer/lexmap.rs rename src/{lexer.rs => lexer/mod.rs} (51%) create mode 100644 src/lexer/token.rs create mode 100644 src/lexer/tokens.rs diff --git a/src/lexer/lexmap.rs b/src/lexer/lexmap.rs new file mode 100644 index 0000000..a605ff6 --- /dev/null +++ b/src/lexer/lexmap.rs @@ -0,0 +1,28 @@ +use std::collections::HashMap; + +use super::Terminal; + +/// Struct for storing terminal symbols with their respective "key". +#[derive(Debug, Clone, Default)] +pub struct LexMap { + map: HashMap<&'static str, Terminal>, +} + +impl LexMap { + pub fn insert(&mut self, key: &'static str, value: Terminal) { + self.map.insert(key, value); + } + + pub fn can_match(&self, key: &str) -> bool { + for map_key in self.map.keys() { + if map_key.starts_with(key) { + return true; + } + } + false + } + + pub fn get(&self, key: &str) -> Option { + self.map.get(key).cloned() + } +} diff --git a/src/lexer.rs b/src/lexer/mod.rs similarity index 51% rename from src/lexer.rs rename to src/lexer/mod.rs index 931236a..a4648da 100644 --- a/src/lexer.rs +++ b/src/lexer/mod.rs @@ -1,229 +1,13 @@ -use lazy_static::lazy_static; -use pesca_parser_derive::Token as ParseToken; -use std::{collections::HashMap, error::Error, fmt::Display, iter::Peekable, str::Chars}; - -type Position = (usize, usize); - -#[derive(Debug, Clone, ParseToken)] -pub enum Token { - #[terminal] - Eq { - position: Position, - }, - #[terminal] - Let { - position: Position, - }, - Id { - value: String, - position: Position, - }, - Num { - value: u64, - position: Position, - }, - #[terminal] - Semicolon { - position: Position, - }, - // TODO: think about lexing comments - Comment { - value: String, - position: Position, - }, - #[terminal] - Plus { - position: Position, - }, - #[terminal] - Times { - position: Position, - }, - #[terminal] - LParen { - position: Position, - }, - #[terminal] - RParen { - position: Position, - }, - #[terminal] - LBrace { - position: Position, - }, - #[terminal] - RBrace { - position: Position, - }, - #[terminal] - FnKeyword { - position: Position, - }, - #[terminal] - ReturnKeyword { - position: Position, - }, - #[terminal] - Colon { - position: Position, - }, - #[terminal] - Comma { - position: Position, - }, -} - -impl Terminal { - pub fn to_token(&self, position: Position) -> Token { - match self { - Terminal::Eq => Token::Eq { position }, - Terminal::Let => Token::Let { position }, - Terminal::Semicolon => Token::Semicolon { position }, - Terminal::Plus => Token::Plus { position }, - Terminal::Times => Token::Times { position }, - Terminal::LParen => Token::LParen { position }, - Terminal::RParen => Token::RParen { position }, - Terminal::LBrace => Token::LBrace { position }, - Terminal::RBrace => Token::RBrace { position }, - Terminal::FnKeyword => Token::FnKeyword { position }, - Terminal::ReturnKeyword => Token::ReturnKeyword { position }, - Terminal::Colon => Token::Colon { position }, - Terminal::Comma => Token::Comma { position }, - } - } -} - -// TODO: move this to own derive macro -impl PartialEq for Token { - fn eq(&self, other: &Self) -> bool { - use Token::*; - matches!( - (self, other), - (Eq { .. }, Eq { .. }) - | (Let { .. }, Let { .. }) - | (Id { .. }, Id { .. }) - | (Num { .. }, Num { .. }) - | (Semicolon { .. }, Semicolon { .. }) - | (Comment { .. }, Comment { .. }) - | (Plus { .. }, Plus { .. }) - | (Times { .. }, Times { .. }) - | (LParen { .. }, LParen { .. }) - | (RParen { .. }, RParen { .. }) - | (LBrace { .. }, LBrace { .. }) - | (RBrace { .. }, RBrace { .. }) - | (FnKeyword { .. }, FnKeyword { .. }) - | (ReturnKeyword { .. }, ReturnKeyword { .. }) - | (Colon { .. }, Colon { .. }) - | (Comma { .. }, Comma { .. }) - ) - } -} - -impl Eq for Token {} - -impl Token { - pub fn position(&self) -> Position { - match self { - Token::Eq { position } => *position, - Token::Let { position } => *position, - Token::Id { position, .. } => *position, - Token::Num { position, .. } => *position, - Token::Semicolon { position } => *position, - Token::Comment { position, .. } => *position, - Token::Plus { position } => *position, - Token::Times { position } => *position, - Token::LParen { position } => *position, - Token::RParen { position } => *position, - Token::LBrace { position } => *position, - Token::RBrace { position } => *position, - Token::FnKeyword { position } => *position, - Token::ReturnKeyword { position } => *position, - Token::Colon { position } => *position, - Token::Comma { position } => *position, - } - } -} - -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct Tokens { - tokens: Vec, - index: usize, -} - -impl Tokens -where - T: Clone, -{ - pub fn new(tokens: Vec) -> Self { - Self { tokens, index: 0 } - } - - pub fn next(&mut self) -> Option { - if self.index < self.tokens.len() { - let item = self.tokens.get(self.index).cloned(); - self.index += 1; - return item; - } - - None - } - - pub fn peek(&mut self) -> Option { - return self.tokens.get(self.index).cloned(); - } - - pub fn get_index(&self) -> usize { - self.index - } - - pub fn set_index(&mut self, index: usize) { - self.index = index; - } -} - -impl From> for Tokens -where - T: Clone, -{ - fn from(value: Vec) -> Self { - Self::new(value) - } -} +mod lexmap; +mod token; +mod tokens; -#[derive(Debug, Clone, PartialEq, Eq)] -pub struct LexError(String); +pub use lexmap::*; +pub use token::*; +pub use tokens::*; -impl Display for LexError { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - f.write_str(self.0.as_str()) - } -} - -impl Error for LexError {} - -#[derive(Debug, Clone, Default)] -struct LexMap { - map: HashMap<&'static str, Terminal>, -} - -impl LexMap { - pub fn insert(&mut self, key: &'static str, value: Terminal) { - self.map.insert(key, value); - } - - pub fn can_match(&self, key: &str) -> bool { - for map_key in self.map.keys() { - if map_key.starts_with(key) { - return true; - } - } - false - } - - pub fn get(&self, key: &str) -> Option { - self.map.get(key).cloned() - } -} +use lazy_static::lazy_static; +use std::{error::Error, fmt::Display, iter::Peekable, str::Chars}; #[macro_export] macro_rules! terminal { @@ -254,6 +38,19 @@ lazy_static! { }; } +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct LexError(String); + +pub type LexResult = Result; + +impl Display for LexError { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.0.as_str()) + } +} + +impl Error for LexError {} + #[derive(Debug, Clone)] pub struct Lexer<'a> { tokens: Vec, @@ -299,13 +96,13 @@ impl<'a> Lexer<'a> { } } - pub fn lex(mut self) -> Result, LexError> { + pub fn lex(mut self) -> LexResult> { self.lex_internal()?; Ok(self.tokens) } - pub fn lex_internal(&mut self) -> Result<(), LexError> { + pub fn lex_internal(&mut self) -> LexResult<()> { self.eat_whitespace(); let Some(next) = self.peek() else { @@ -321,7 +118,7 @@ impl<'a> Lexer<'a> { Ok(()) } - fn lex_special(&mut self) -> Result<(), LexError> { + fn lex_special(&mut self) -> LexResult<()> { let mut stack = vec![]; let position = (self.line, self.col); @@ -357,7 +154,7 @@ impl<'a> Lexer<'a> { self.lex_internal() } - fn lex_alphanumeric(&mut self) -> Result<(), LexError> { + fn lex_alphanumeric(&mut self) -> LexResult<()> { let mut stack = vec![]; let position = (self.line, self.col); @@ -381,7 +178,7 @@ impl<'a> Lexer<'a> { self.lex_internal() } - fn lex_numeric(&mut self) -> Result<(), LexError> { + fn lex_numeric(&mut self) -> LexResult<()> { let mut stack = vec![]; let position = (self.line, self.col); diff --git a/src/lexer/token.rs b/src/lexer/token.rs new file mode 100644 index 0000000..b5344cc --- /dev/null +++ b/src/lexer/token.rs @@ -0,0 +1,143 @@ +use pesca_parser_derive::Token as ParseToken; + +type Position = (usize, usize); + +#[derive(Debug, Clone, ParseToken)] +pub enum Token { + #[terminal] + Eq { + position: Position, + }, + #[terminal] + Let { + position: Position, + }, + Id { + value: String, + position: Position, + }, + Num { + value: u64, + position: Position, + }, + #[terminal] + Semicolon { + position: Position, + }, + // TODO: think about lexing comments + Comment { + value: String, + position: Position, + }, + #[terminal] + Plus { + position: Position, + }, + #[terminal] + Times { + position: Position, + }, + #[terminal] + LParen { + position: Position, + }, + #[terminal] + RParen { + position: Position, + }, + #[terminal] + LBrace { + position: Position, + }, + #[terminal] + RBrace { + position: Position, + }, + #[terminal] + FnKeyword { + position: Position, + }, + #[terminal] + ReturnKeyword { + position: Position, + }, + #[terminal] + Colon { + position: Position, + }, + #[terminal] + Comma { + position: Position, + }, +} + +impl Terminal { + pub fn to_token(&self, position: Position) -> Token { + match self { + Terminal::Eq => Token::Eq { position }, + Terminal::Let => Token::Let { position }, + Terminal::Semicolon => Token::Semicolon { position }, + Terminal::Plus => Token::Plus { position }, + Terminal::Times => Token::Times { position }, + Terminal::LParen => Token::LParen { position }, + Terminal::RParen => Token::RParen { position }, + Terminal::LBrace => Token::LBrace { position }, + Terminal::RBrace => Token::RBrace { position }, + Terminal::FnKeyword => Token::FnKeyword { position }, + Terminal::ReturnKeyword => Token::ReturnKeyword { position }, + Terminal::Colon => Token::Colon { position }, + Terminal::Comma => Token::Comma { position }, + } + } +} + +// TODO: move this to own derive macro +impl PartialEq for Token { + fn eq(&self, other: &Self) -> bool { + use Token::*; + matches!( + (self, other), + (Eq { .. }, Eq { .. }) + | (Let { .. }, Let { .. }) + | (Id { .. }, Id { .. }) + | (Num { .. }, Num { .. }) + | (Semicolon { .. }, Semicolon { .. }) + | (Comment { .. }, Comment { .. }) + | (Plus { .. }, Plus { .. }) + | (Times { .. }, Times { .. }) + | (LParen { .. }, LParen { .. }) + | (RParen { .. }, RParen { .. }) + | (LBrace { .. }, LBrace { .. }) + | (RBrace { .. }, RBrace { .. }) + | (FnKeyword { .. }, FnKeyword { .. }) + | (ReturnKeyword { .. }, ReturnKeyword { .. }) + | (Colon { .. }, Colon { .. }) + | (Comma { .. }, Comma { .. }) + ) + } +} + +impl Eq for Token {} + +impl Token { + pub fn position(&self) -> Position { + match self { + Token::Eq { position } => *position, + Token::Let { position } => *position, + Token::Id { position, .. } => *position, + Token::Num { position, .. } => *position, + Token::Semicolon { position } => *position, + Token::Comment { position, .. } => *position, + Token::Plus { position } => *position, + Token::Times { position } => *position, + Token::LParen { position } => *position, + Token::RParen { position } => *position, + Token::LBrace { position } => *position, + Token::RBrace { position } => *position, + Token::FnKeyword { position } => *position, + Token::ReturnKeyword { position } => *position, + Token::Colon { position } => *position, + Token::Comma { position } => *position, + } + } +} diff --git a/src/lexer/tokens.rs b/src/lexer/tokens.rs new file mode 100644 index 0000000..cfea104 --- /dev/null +++ b/src/lexer/tokens.rs @@ -0,0 +1,50 @@ +/// Struct for iterating over a vector of tokens. +#[derive(Debug, Clone, PartialEq, Eq)] +pub struct Tokens { + tokens: Vec, + index: usize, +} + +impl Tokens +where + T: Clone, +{ + pub fn new(tokens: Vec) -> Self { + Self { tokens, index: 0 } + } + + /// Get the next item (if present). + pub fn next(&mut self) -> Option { + if self.index < self.tokens.len() { + let item = self.tokens.get(self.index).cloned(); + self.index += 1; + return item; + } + + None + } + + /// Peek at the next item. + pub fn peek(&mut self) -> Option { + return self.tokens.get(self.index).cloned(); + } + + /// Get the current index. + pub fn get_index(&self) -> usize { + self.index + } + + /// Set the index of this "iterator". + pub fn set_index(&mut self, index: usize) { + self.index = index; + } +} + +impl From> for Tokens +where + T: Clone, +{ + fn from(value: Vec) -> Self { + Self::new(value) + } +}