Skip to content

Commit

Permalink
ginkgo: correctly lex UTF-8 in identifiers
Browse files Browse the repository at this point in the history
  • Loading branch information
IsaacWoods committed Nov 14, 2024
1 parent d227cf2 commit 31fd218
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 8 deletions.
9 changes: 8 additions & 1 deletion ginkgo/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions ginkgo/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ required-features = ["rustyline"]
[dependencies]
std = { path = "../lib/std", optional = true }
rustyline = { version = "14.0.0", features = ["derive"], optional = true }
unicode-xid = "0.2.6"

[features]
default = ["rustyline"]
Expand Down
79 changes: 73 additions & 6 deletions ginkgo/src/lex.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
use std::str::Chars;

use unicode_xid::UnicodeXID;

#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Debug)]
pub enum TokenType {
/*
Expand Down Expand Up @@ -75,8 +77,6 @@ pub enum TokenValue<'s> {

pub struct Lex<'s> {
source: &'s str,
// TODO: I wonder if we, in a UTF8-aware world, want to iterate over grapheme clusters instead
// (this may well be a fair bit slower??)
stream: PeekingIter<Chars<'s>>,
offset: usize,

Expand Down Expand Up @@ -106,7 +106,7 @@ impl<'s> Lex<'s> {
/// Consumes the next `char` from `stream` and returns it, or returns `None` (via `?`) once the
/// stream is exhausted, in which case neither counter is touched.
///
/// NOTE(review): this span looks like a rendered diff with the +/- markers stripped, so both the
/// old and the new `current_length` update appear below - presumably only the second
/// (`c.len_utf8()`) exists in the committed file. TODO confirm against the real `lex.rs`.
pub fn advance(&mut self) -> Option<char> {
let c = self.stream.next()?;
// Advances by one per `char`, not per byte.
// NOTE(review): `current_length` is grown in bytes below; if `offset` is ever used as a byte
// index into `source` it will drift on multi-byte characters - verify against its users.
self.offset += 1;
// Presumably the pre-change line: counts every `char` as length 1 regardless of its encoding.
self.current_length += 1;
// Grows `current_length` by the character's UTF-8 encoded size in bytes.
self.current_length += c.len_utf8();
Some(c)
}

Expand Down Expand Up @@ -185,7 +185,7 @@ impl<'s> Iterator for Lex<'s> {
_ => return Some(self.produce(TokenType::Pipe)),
},

// TODO: parse comments, both line and block here
// TODO: parse comments, both line and block here, including handling of nested comments
'/' => return Some(self.produce(TokenType::Slash)),

/*
Expand All @@ -199,6 +199,8 @@ impl<'s> Iterator for Lex<'s> {
c if c.is_digit(10) => {
// TODO: parse hex
// TODO: support octal?
// TODO: scientific notation
// TODO: separation with underscore between digits

while self.stream.peek().map_or(false, |c| c.is_digit(10)) {
self.advance();
Expand Down Expand Up @@ -239,12 +241,13 @@ impl<'s> Iterator for Lex<'s> {
/*
* Parse keywords and identifiers.
*/
c if c.is_alphanumeric() => {
// TODO: identifiers should be able to start with underscores, but an underscore on its own should not be lexed as an identifier
c if c.is_xid_start() || c == '_' => {
/*
* Do a maximal munch to make sure identifiers that start with reserved
* keywords are not mistaken for those keywords.
*/
while self.stream.peek().map_or(false, |c| c.is_alphanumeric() || c == '_') {
while self.stream.peek().map_or(false, |c| c.is_xid_continue()) {
self.advance()?;
}

Expand Down Expand Up @@ -335,3 +338,67 @@ where
}
}
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Lexes `source` and checks that the tokens it yields have exactly the types listed in
    /// `tokens`, in order.
    ///
    /// Panics on the first mismatch, or if the lexer stops producing tokens before every expected
    /// type has been seen. Anything the lexer would produce after the expected tokens is not
    /// inspected.
    fn test_tokens(source: &str, tokens: &[TokenType]) {
        let mut lex = Lex::new(source);

        // `iter()` rather than `into_iter()`: on a slice reference both yield `&TokenType`, but
        // `iter()` says so explicitly. The index is only used to make failures easier to locate.
        for (index, expected) in tokens.iter().enumerate() {
            match lex.next() {
                Some(token) if token.typ == *expected => (),
                Some(other) => panic!(
                    "Got wrong type of token at position {}: {:?} => {:?} (expected {:?})",
                    index,
                    other,
                    lex.token_value(other),
                    expected
                ),
                None => panic!(
                    "Lexer ran out of tokens at position {} (expected {:?})",
                    index, expected
                ),
            }
        }
    }

    /// Every reserved word, separated by single spaces, must lex to its dedicated token type.
    #[test]
    fn keywords() {
        test_tokens(
            "let if else for loop while true false return fn class self",
            &[
                TokenType::Let,
                TokenType::If,
                TokenType::Else,
                TokenType::For,
                TokenType::Loop,
                TokenType::While,
                TokenType::True,
                TokenType::False,
                TokenType::Return,
                TokenType::Fn,
                TokenType::Class,
                TokenType::GinkgoSelf,
            ],
        );
    }

    /// Identifiers, including non-ASCII ones, must lex as a single token whose value is the
    /// whole input string.
    #[test]
    fn identifiers() {
        /// Asserts that `ident` lexes to exactly one token and that the token's value is an
        /// identifier equal to `ident`.
        fn test_identifier(ident: &str) {
            let mut lex = Lex::new(ident);
            let token = lex.next().expect("Failed to lex identifier correctly!");
            assert!(
                lex.next().is_none(),
                "Lexer produced more than one token for identifier {:?}",
                ident
            );

            match lex.token_value(token) {
                Some(TokenValue::Identifier(lexed_ident)) => assert_eq!(ident, lexed_ident),
                _ => panic!("Failed to lex identifier correctly!"),
            }
        }

        test_identifier("foo");
        test_identifier("bar73");
        test_identifier("with_some_underscores");
        test_identifier("do_n0t_nam3_th1ngs_l1k3_thi5");
        // Multi-byte UTF-8 identifiers: each character is more than one byte long.
        test_identifier("Москва");
        test_identifier("東京");
    }

    // TODO: test numbers - hex literals, octal literals, binary literals, scientific notation, underscore separation
    // TODO: test strings
    // TODO: test operators
    // TODO: test comments
}
9 changes: 8 additions & 1 deletion user/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 31fd218

Please sign in to comment.