diff --git a/src/tokenizer.rs b/src/tokenizer.rs
index bbad1a4c4..44dfea3dd 100644
--- a/src/tokenizer.rs
+++ b/src/tokenizer.rs
@@ -253,6 +253,31 @@ impl fmt::Display for Whitespace {
     }
 }
 
+/// Location in input string
+#[derive(Debug, PartialEq, Clone)]
+pub struct Location {
+    /// Line number, starting from 1
+    pub line: u64,
+    /// Line column, starting from 1
+    pub column: u64,
+}
+
+/// A [Token] with [Location] attached to it
+#[derive(Debug, PartialEq, Clone)]
+pub struct TokenWithLocation {
+    pub token: Token,
+    pub location: Location,
+}
+
+impl TokenWithLocation {
+    pub fn new(token: Token, line: u64, column: u64) -> TokenWithLocation {
+        TokenWithLocation {
+            token,
+            location: Location { line, column },
+        }
+    }
+}
+
 /// Tokenizer error
 #[derive(Debug, PartialEq)]
 pub struct TokenizerError {
@@ -261,78 +286,113 @@ pub struct TokenizerError {
     pub col: u64,
 }
 
+struct State<'a> {
+    peekable: Peekable<Chars<'a>>,
+    pub line: u64,
+    pub col: u64,
+}
+
+impl<'a> State<'a> {
+    pub fn next(&mut self) -> Option<char> {
+        match self.peekable.next() {
+            None => None,
+            Some(s) => {
+                if s == '\n' {
+                    self.line += 1;
+                    self.col = 1;
+                } else {
+                    self.col += 1;
+                }
+                Some(s)
+            }
+        }
+    }
+
+    pub fn peek(&mut self) -> Option<&char> {
+        self.peekable.peek()
+    }
+
+    pub fn location(&self) -> Location {
+        Location {
+            line: self.line,
+            column: self.col,
+        }
+    }
+}
+
 /// SQL Tokenizer
 pub struct Tokenizer<'a> {
     dialect: &'a dyn Dialect,
-    pub query: String,
-    pub line: u64,
-    pub col: u64,
+    pub query: &'a str,
 }
 
 impl<'a> Tokenizer<'a> {
     /// Create a new SQL tokenizer for the specified SQL statement
-    pub fn new(dialect: &'a dyn Dialect, query: &str) -> Self {
-        Self {
-            dialect,
-            query: query.to_string(),
-            line: 1,
-            col: 1,
-        }
+    pub fn new(dialect: &'a dyn Dialect, query: &'a str) -> Self {
+        Self { dialect, query }
     }
 
     /// Tokenize the statement and produce a vector of tokens
     pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizerError> {
-        let mut peekable = self.query.chars().peekable();
-
+        let twl = self.tokenize_with_location()?;
         let mut tokens: Vec<Token> = vec![];
-        while let Some(token) = self.next_token(&mut peekable)? {
-            match &token {
-                Token::Whitespace(Whitespace::Newline) => {
-                    self.line += 1;
-                    self.col = 1;
-                }
+        tokens.reserve(twl.len());
+        for token_with_location in twl {
+            tokens.push(token_with_location.token);
+        }
+        Ok(tokens)
+    }
 
-                Token::Whitespace(Whitespace::Tab) => self.col += 4,
-                Token::Word(w) if w.quote_style == None => self.col += w.value.len() as u64,
-                Token::Word(w) if w.quote_style != None => self.col += w.value.len() as u64 + 2,
-                Token::Number(s) => self.col += s.len() as u64,
-                Token::SingleQuotedString(s) => self.col += s.len() as u64,
-                _ => self.col += 1,
-            }
+    /// Tokenize the statement and produce a vector of tokens with location information
+    pub fn tokenize_with_location(&mut self) -> Result<Vec<TokenWithLocation>, TokenizerError> {
+        let mut state = State {
+            peekable: self.query.chars().peekable(),
+            line: 1,
+            col: 1,
+        };
+
+        let mut tokens: Vec<TokenWithLocation> = vec![];
 
-            tokens.push(token);
+        let mut location = state.location();
+        while let Some(token) = self.next_token(&mut state)? {
+            tokens.push(TokenWithLocation {
+                token,
+                location: location.clone(),
+            });
+
+            location = state.location();
         }
         Ok(tokens)
     }
 
     /// Get the next token or return None
-    fn next_token(&self, chars: &mut Peekable<Chars<'_>>) -> Result<Option<Token>, TokenizerError> {
+    fn next_token(&self, state: &mut State) -> Result<Option<Token>, TokenizerError> {
         //println!("next_token: {:?}", chars.peek());
-        match chars.peek() {
+        match state.peek() {
             Some(&ch) => match ch {
-                ' ' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Space)),
-                '\t' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Tab)),
-                '\n' => self.consume_and_return(chars, Token::Whitespace(Whitespace::Newline)),
+                ' ' => self.consume_and_return(state, Token::Whitespace(Whitespace::Space)),
+                '\t' => self.consume_and_return(state, Token::Whitespace(Whitespace::Tab)),
+                '\n' => self.consume_and_return(state, Token::Whitespace(Whitespace::Newline)),
                 '\r' => {
                     // Emit a single Whitespace::Newline token for \r and \r\n
-                    chars.next();
-                    if let Some('\n') = chars.peek() {
-                        chars.next();
+                    state.next();
+                    if let Some('\n') = state.peek() {
+                        state.next();
                     }
                     Ok(Some(Token::Whitespace(Whitespace::Newline)))
                 }
                 'N' => {
-                    chars.next(); // consume, to check the next char
-                    match chars.peek() {
+                    state.next(); // consume, to check the next char
+                    match state.peek() {
                         Some('\'') => {
                             // N'...' - a <national character string literal>
-                            let s = self.tokenize_single_quoted_string(chars)?;
+                            let s = self.tokenize_single_quoted_string(state)?;
                             Ok(Some(Token::NationalStringLiteral(s)))
                         }
                         _ => {
                             // regular identifier starting with an "N"
-                            let s = self.tokenize_word('N', chars);
+                            let s = self.tokenize_word('N', state);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -340,40 +400,41 @@ impl<'a> Tokenizer<'a> {
                 // The spec only allows an uppercase 'X' to introduce a hex
                 // string, but PostgreSQL, at least, allows a lowercase 'x' too.
                 x @ 'x' | x @ 'X' => {
-                    chars.next(); // consume, to check the next char
-                    match chars.peek() {
+                    state.next(); // consume, to check the next char
+                    match state.peek() {
                         Some('\'') => {
                             // X'...' - a <binary string literal>
-                            let s = self.tokenize_single_quoted_string(chars)?;
+                            let s = self.tokenize_single_quoted_string(state)?;
                             Ok(Some(Token::HexStringLiteral(s)))
                         }
                         _ => {
                             // regular identifier starting with an "X"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x, state);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
                 }
                 // identifier or keyword
                 ch if self.dialect.is_identifier_start(ch) => {
-                    chars.next(); // consume the first char
-                    let s = self.tokenize_word(ch, chars);
+                    state.next(); // consume the first char
+                    let s = self.tokenize_word(ch, state);
                     Ok(Some(Token::make_word(&s, None)))
                 }
                 // string
                 '\'' => {
-                    let s = self.tokenize_single_quoted_string(chars)?;
+                    let s = self.tokenize_single_quoted_string(state)?;
                     Ok(Some(Token::SingleQuotedString(s)))
                 }
                 // delimited (quoted) identifier
                 quote_start if self.dialect.is_delimited_identifier_start(quote_start) => {
-                    chars.next(); // consume the opening quote
+                    state.next(); // consume the opening quote
                     let quote_end = Word::matching_end_quote(quote_start);
-                    let s = peeking_take_while(chars, |ch| ch != quote_end);
-                    if chars.next() == Some(quote_end) {
+                    let s = peeking_take_while(state, |ch| ch != quote_end);
+                    if state.next() == Some(quote_end) {
                         Ok(Some(Token::make_word(&s, Some(quote_start))))
                     } else {
                         self.tokenizer_error(
+                            state,
                             format!("Expected close delimiter '{}' before EOF.", quote_end)
                                 .as_str(),
                         )
@@ -382,20 +443,20 @@ impl<'a> Tokenizer<'a> {
                 // numbers
                 '0'..='9' => {
                     // TODO: https://jakewheat.github.io/sql-overview/sql-2011-foundation-grammar.html#unsigned-numeric-literal
-                    let s = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
+                    let s = peeking_take_while(state, |ch| matches!(ch, '0'..='9' | '.'));
                     Ok(Some(Token::Number(s)))
                 }
                 // punctuation
-                '(' => self.consume_and_return(chars, Token::LParen),
-                ')' => self.consume_and_return(chars, Token::RParen),
-                ',' => self.consume_and_return(chars, Token::Comma),
+                '(' => self.consume_and_return(state, Token::LParen),
+                ')' => self.consume_and_return(state, Token::RParen),
+                ',' => self.consume_and_return(state, Token::Comma),
                 // operators
                 '-' => {
-                    chars.next(); // consume the '-'
-                    match chars.peek() {
+                    state.next(); // consume the '-'
+                    match state.peek() {
                         Some('-') => {
-                            chars.next(); // consume the second '-', starting a single-line comment
-                            let comment = self.tokenize_single_line_comment(chars);
+                            state.next(); // consume the second '-', starting a single-line comment
+                            let comment = self.tokenize_single_line_comment(state);
                             Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                 prefix: "--".to_owned(),
                                 comment,
@@ -406,15 +467,15 @@ impl<'a> Tokenizer<'a> {
                     }
                 }
                 '/' => {
-                    chars.next(); // consume the '/'
-                    match chars.peek() {
+                    state.next(); // consume the '/'
+                    match state.peek() {
                         Some('*') => {
-                            chars.next(); // consume the '*', starting a multi-line comment
-                            self.tokenize_multiline_comment(chars)
+                            state.next(); // consume the '*', starting a multi-line comment
+                            self.tokenize_multiline_comment(state)
                         }
                         Some('/') if dialect_of!(self is SnowflakeDialect) => {
-                            chars.next(); // consume the second '/', starting a snowflake single-line comment
-                            let comment = self.tokenize_single_line_comment(chars);
+                            state.next(); // consume the second '/', starting a snowflake single-line comment
+                            let comment = self.tokenize_single_line_comment(state);
                             Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                                 prefix: "//".to_owned(),
                                 comment,
@@ -424,17 +485,17 @@ impl<'a> Tokenizer<'a> {
                         _ => Ok(Some(Token::Div)),
                     }
                 }
-                '+' => self.consume_and_return(chars, Token::Plus),
-                '*' => self.consume_and_return(chars, Token::Mult),
-                '%' => self.consume_and_return(chars, Token::Mod),
+                '+' => self.consume_and_return(state, Token::Plus),
+                '*' => self.consume_and_return(state, Token::Mult),
+                '%' => self.consume_and_return(state, Token::Mod),
                 '|' => {
-                    chars.next(); // consume the '|'
-                    match chars.peek() {
-                        Some('/') => self.consume_and_return(chars, Token::PGSquareRoot),
+                    state.next(); // consume the '|'
+                    match state.peek() {
+                        Some('/') => self.consume_and_return(state, Token::PGSquareRoot),
                         Some('|') => {
-                            chars.next(); // consume the second '|'
-                            match chars.peek() {
-                                Some('/') => self.consume_and_return(chars, Token::PGCubeRoot),
+                            state.next(); // consume the second '|'
+                            match state.peek() {
+                                Some('/') => self.consume_and_return(state, Token::PGCubeRoot),
                                 _ => Ok(Some(Token::StringConcat)),
                             }
                         }
@@ -443,82 +504,83 @@ impl<'a> Tokenizer<'a> {
                     }
                 }
                 '=' => {
-                    chars.next(); // consume
-                    match chars.peek() {
-                        Some('>') => self.consume_and_return(chars, Token::RArrow),
+                    state.next(); // consume
+                    match state.peek() {
+                        Some('>') => self.consume_and_return(state, Token::RArrow),
                         _ => Ok(Some(Token::Eq)),
                     }
                 }
-                '.' => self.consume_and_return(chars, Token::Period),
+                '.' => self.consume_and_return(state, Token::Period),
                 '!' => {
-                    chars.next(); // consume
-                    match chars.peek() {
-                        Some('=') => self.consume_and_return(chars, Token::Neq),
-                        Some('!') => self.consume_and_return(chars, Token::DoubleExclamationMark),
+                    state.next(); // consume
+                    match state.peek() {
+                        Some('=') => self.consume_and_return(state, Token::Neq),
+                        Some('!') => self.consume_and_return(state, Token::DoubleExclamationMark),
                         _ => Ok(Some(Token::ExclamationMark)),
                     }
                 }
                 '<' => {
-                    chars.next(); // consume
-                    match chars.peek() {
-                        Some('=') => self.consume_and_return(chars, Token::LtEq),
-                        Some('>') => self.consume_and_return(chars, Token::Neq),
-                        Some('<') => self.consume_and_return(chars, Token::ShiftLeft),
+                    state.next(); // consume
+                    match state.peek() {
+                        Some('=') => self.consume_and_return(state, Token::LtEq),
+                        Some('>') => self.consume_and_return(state, Token::Neq),
+                        Some('<') => self.consume_and_return(state, Token::ShiftLeft),
                         _ => Ok(Some(Token::Lt)),
                     }
                 }
                 '>' => {
-                    chars.next(); // consume
-                    match chars.peek() {
-                        Some('=') => self.consume_and_return(chars, Token::GtEq),
-                        Some('>') => self.consume_and_return(chars, Token::ShiftRight),
+                    state.next(); // consume
+                    match state.peek() {
+                        Some('=') => self.consume_and_return(state, Token::GtEq),
+                        Some('>') => self.consume_and_return(state, Token::ShiftRight),
                         _ => Ok(Some(Token::Gt)),
                     }
                 }
                 ':' => {
-                    chars.next();
-                    match chars.peek() {
-                        Some(':') => self.consume_and_return(chars, Token::DoubleColon),
+                    state.next();
+                    match state.peek() {
+                        Some(':') => self.consume_and_return(state, Token::DoubleColon),
                         _ => Ok(Some(Token::Colon)),
                     }
                 }
-                ';' => self.consume_and_return(chars, Token::SemiColon),
-                '\\' => self.consume_and_return(chars, Token::Backslash),
-                '[' => self.consume_and_return(chars, Token::LBracket),
-                ']' => self.consume_and_return(chars, Token::RBracket),
-                '&' => self.consume_and_return(chars, Token::Ampersand),
-                '^' => self.consume_and_return(chars, Token::Caret),
-                '{' => self.consume_and_return(chars, Token::LBrace),
-                '}' => self.consume_and_return(chars, Token::RBrace),
+                ';' => self.consume_and_return(state, Token::SemiColon),
+                '\\' => self.consume_and_return(state, Token::Backslash),
+                '[' => self.consume_and_return(state, Token::LBracket),
+                ']' => self.consume_and_return(state, Token::RBracket),
+                '&' => self.consume_and_return(state, Token::Ampersand),
+                '^' => self.consume_and_return(state, Token::Caret),
+                '{' => self.consume_and_return(state, Token::LBrace),
+                '}' => self.consume_and_return(state, Token::RBrace),
                 '#' if dialect_of!(self is SnowflakeDialect) => {
-                    chars.next(); // consume the '#', starting a snowflake single-line comment
-                    let comment = self.tokenize_single_line_comment(chars);
+                    state.next(); // consume the '#', starting a snowflake single-line comment
+                    let comment = self.tokenize_single_line_comment(state);
                     Ok(Some(Token::Whitespace(Whitespace::SingleLineComment {
                         prefix: "#".to_owned(),
                         comment,
                     })))
                 }
-                '~' => self.consume_and_return(chars, Token::Tilde),
-                '#' => self.consume_and_return(chars, Token::Sharp),
-                '@' => self.consume_and_return(chars, Token::AtSign),
-                other => self.consume_and_return(chars, Token::Char(other)),
+                '~' => self.consume_and_return(state, Token::Tilde),
+                '#' => self.consume_and_return(state, Token::Sharp),
+                '@' => self.consume_and_return(state, Token::AtSign),
+                other => self.consume_and_return(state, Token::Char(other)),
             },
             None => Ok(None),
         }
     }
 
-    fn tokenizer_error<R>(&self, message: &str) -> Result<R, TokenizerError> {
+    fn tokenizer_error<R>(&self, state: &State, message: &str) -> Result<R, TokenizerError> {
+        let loc = state.location();
         Err(TokenizerError {
             message: message.to_string(),
-            col: self.col,
-            line: self.line,
+            col: loc.column,
+            line: loc.line,
         })
     }
 
     // Consume characters until newline
-    fn tokenize_single_line_comment(&self, chars: &mut Peekable<Chars<'_>>) -> String {
-        let mut comment = peeking_take_while(chars, |ch| ch != '\n');
-        if let Some(ch) = chars.next() {
+    fn tokenize_single_line_comment(&self, state: &mut State) -> String {
+        let mut comment = peeking_take_while(state, |ch| ch != '\n');
+        if let Some(ch) = state.next() {
             assert_eq!(ch, '\n');
             comment.push(ch);
         }
@@ -526,51 +588,48 @@ impl<'a> Tokenizer<'a> {
     }
 
     /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_char: char, chars: &mut Peekable<Chars<'_>>) -> String {
+    fn tokenize_word(&self, first_char: char, state: &mut State) -> String {
         let mut s = first_char.to_string();
-        s.push_str(&peeking_take_while(chars, |ch| {
+        s.push_str(&peeking_take_while(state, |ch| {
             self.dialect.is_identifier_part(ch)
         }));
         s
     }
 
     /// Read a single quoted string, starting with the opening quote.
-    fn tokenize_single_quoted_string(
-        &self,
-        chars: &mut Peekable<Chars<'_>>,
-    ) -> Result<String, TokenizerError> {
+    fn tokenize_single_quoted_string(&self, state: &mut State) -> Result<String, TokenizerError> {
        let mut s = String::new();
-        chars.next(); // consume the opening quote
-        while let Some(&ch) = chars.peek() {
+        state.next(); // consume the opening quote
+        while let Some(&ch) = state.peek() {
             match ch {
                 '\'' => {
-                    chars.next(); // consume
-                    let escaped_quote = chars.peek().map(|c| *c == '\'').unwrap_or(false);
+                    state.next(); // consume
+                    let escaped_quote = state.peek().map(|c| *c == '\'').unwrap_or(false);
                     if escaped_quote {
                         s.push('\'');
-                        chars.next();
+                        state.next();
                     } else {
                         return Ok(s);
                     }
                 }
                 _ => {
-                    chars.next(); // consume
+                    state.next(); // consume
                     s.push(ch);
                 }
             }
         }
-        self.tokenizer_error("Unterminated string literal")
+        self.tokenizer_error(state, "Unterminated string literal")
     }
 
     fn tokenize_multiline_comment(
         &self,
-        chars: &mut Peekable<Chars<'_>>,
+        state: &mut State,
     ) -> Result<Option<Token>, TokenizerError> {
         let mut s = String::new();
         let mut maybe_closing_comment = false;
         // TODO: deal with nested comments
         loop {
-            match chars.next() {
+            match state.next() {
                 Some(ch) => {
                     if maybe_closing_comment {
                         if ch == '/' {
@@ -584,17 +643,20 @@ impl<'a> Tokenizer<'a> {
                         s.push(ch);
                     }
                 }
-                None => break self.tokenizer_error("Unexpected EOF while in a multi-line comment"),
+                None => {
+                    break self
+                        .tokenizer_error(state, "Unexpected EOF while in a multi-line comment")
+                }
             }
         }
     }
 
     fn consume_and_return(
         &self,
-        chars: &mut Peekable<Chars<'_>>,
+        state: &mut State,
         t: Token,
     ) -> Result<Option<Token>, TokenizerError> {
-        chars.next();
+        state.next();
         Ok(Some(t))
     }
 }
@@ -602,14 +664,11 @@ impl<'a> Tokenizer<'a> {
 /// Read from `chars` until `predicate` returns `false` or EOF is hit.
 /// Return the characters read as String, and keep the first non-matching
 /// char available as `chars.next()`.
-fn peeking_take_while(
-    chars: &mut Peekable<Chars<'_>>,
-    mut predicate: impl FnMut(char) -> bool,
-) -> String {
+fn peeking_take_while(state: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
     let mut s = String::new();
-    while let Some(&ch) = chars.peek() {
+    while let Some(&ch) = state.peek() {
         if predicate(ch) {
-            chars.next(); // consume
+            state.next(); // consume
             s.push(ch);
         } else {
             break;
@@ -866,7 +925,7 @@ mod tests {
             Err(TokenizerError {
                 message: "Unterminated string literal".to_string(),
                 line: 1,
-                col: 8
+                col: 12
             })
         );
     }
@@ -1010,7 +1069,7 @@ mod tests {
             Err(TokenizerError {
                 message: "Expected close delimiter '\"' before EOF.".to_string(),
                 line: 1,
-                col: 1
+                col: 5,
            })
         );
     }
@@ -1064,4 +1123,27 @@ mod tests {
         //println!("------------------------------");
         assert_eq!(expected, actual);
     }
+
+    fn compare_with_location(expected: Vec<TokenWithLocation>, actual: Vec<TokenWithLocation>) {
+        //println!("------------------------------");
+        //println!("tokens   = {:?}", actual);
+        //println!("expected = {:?}", expected);
+        //println!("------------------------------");
+        assert_eq!(expected, actual);
+    }
+
+    #[test]
+    fn tokenize_location_multiline_string_literal() {
+        let sql = String::from("'some\nthing' foo");
+
+        let dialect = GenericDialect {};
+        let mut tokenizer = Tokenizer::new(&dialect, &sql);
+        let tokens = tokenizer.tokenize_with_location().unwrap();
+        let expected = vec![
+            TokenWithLocation::new(Token::SingleQuotedString("some\nthing".to_string()), 1, 1),
+            TokenWithLocation::new(Token::Whitespace(Whitespace::Space), 2, 7),
+            TokenWithLocation::new(Token::make_word("foo", None), 2, 8),
+        ];
+        compare_with_location(expected, tokens);
+    }
 }
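Illustrative usage of the new tokenize_with_location API added above, as a minimal sketch (the module paths assume the usual sqlparser crate layout; the sample query and the error formatting are for demonstration only, not part of this change):

    use sqlparser::dialect::GenericDialect;
    use sqlparser::tokenizer::Tokenizer;

    fn main() {
        let dialect = GenericDialect {};
        let sql = "SELECT 1,\n  foo";

        let mut tokenizer = Tokenizer::new(&dialect, sql);
        match tokenizer.tokenize_with_location() {
            // Each TokenWithLocation carries the 1-based line/column at which the token starts.
            Ok(tokens) => {
                for t in tokens {
                    println!("{:?} at {}:{}", t.token, t.location.line, t.location.column);
                }
            }
            // TokenizerError reports the line/column where tokenization stopped.
            Err(e) => eprintln!("error at {}:{}: {}", e.line, e.col, e.message),
        }
    }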