diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 13bce0c0d..b1427e3d2 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1186,8 +1186,8 @@ impl<'a> Tokenizer<'a> { Ok(Some(Token::make_word(&word.concat(), Some(quote_start)))) } - // numbers and period - '0'..='9' | '.' => { + // Numbers + '0'..='9' => { // Some dialects support underscore as number separator // There can only be one at a time and it must be followed by another digit let is_number_separator = |ch: char, next_char: Option| { @@ -1196,11 +1196,12 @@ impl<'a> Tokenizer<'a> { && next_char.is_some_and(|next_ch| next_ch.is_ascii_hexdigit()) }; + // Start with number or potential separator let mut s = peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) }); - // match binary literal that starts with 0x + // Match binary literal that starts with 0x if s == "0" && chars.peek() == Some(&'x') { chars.next(); let s2 = peeking_next_take_while(chars, |ch, next_ch| { @@ -1209,51 +1210,34 @@ impl<'a> Tokenizer<'a> { return Ok(Some(Token::HexStringLiteral(s2))); } - // match one period + // Match fractional part after a dot if let Some('.') = chars.peek() { s.push('.'); chars.next(); } - // If the dialect supports identifiers that start with a numeric prefix - // and we have now consumed a dot, check if the previous token was a Word. - // If so, what follows is definitely not part of a decimal number and - // we should yield the dot as a dedicated token so compound identifiers - // starting with digits can be parsed correctly. - if s == "." && self.dialect.supports_numeric_prefix() { - if let Some(Token::Word(_)) = prev_token { - return Ok(Some(Token::Period)); - } - } - - // Consume fractional digits. + // Consume fractional digits s += &peeking_next_take_while(chars, |ch, next_ch| { ch.is_ascii_digit() || is_number_separator(ch, next_ch) }); - // No fraction -> Token::Period - if s == "." { - return Ok(Some(Token::Period)); - } - - // Parse exponent as number + // Parse exponent part (e.g., e+10 or E-5) let mut exponent_part = String::new(); if chars.peek() == Some(&'e') || chars.peek() == Some(&'E') { let mut char_clone = chars.peekable.clone(); - exponent_part.push(char_clone.next().unwrap()); + exponent_part.push(char_clone.next().unwrap()); // consume 'e' or 'E' // Optional sign - match char_clone.peek() { - Some(&c) if matches!(c, '+' | '-') => { + if let Some(&c) = char_clone.peek() { + if c == '+' || c == '-' { exponent_part.push(c); char_clone.next(); } - _ => (), } - match char_clone.peek() { - // Definitely an exponent, get original iterator up to speed and use it - Some(&c) if c.is_ascii_digit() => { + // Parse digits after the exponent + if let Some(&c) = char_clone.peek() { + if c.is_ascii_digit() { for _ in 0..exponent_part.len() { chars.next(); } @@ -1261,8 +1245,6 @@ impl<'a> Tokenizer<'a> { &peeking_take_while(chars, |ch| ch.is_ascii_digit()); s += exponent_part.as_str(); } - // Not an exponent, discard the work done - _ => (), } } @@ -1271,8 +1253,7 @@ impl<'a> Tokenizer<'a> { // be tokenized as a word. if self.dialect.supports_numeric_prefix() { if exponent_part.is_empty() { - // If it is not a number with an exponent, it may be - // an identifier starting with digits. + // Handle as potential word if no exponent part let word = peeking_take_while(chars, |ch| self.dialect.is_identifier_part(ch)); @@ -1281,20 +1262,91 @@ impl<'a> Tokenizer<'a> { return Ok(Some(Token::make_word(s.as_str(), None))); } } else if prev_token == Some(&Token::Period) { - // If the previous token was a period, thus not belonging to a number, - // the value we have is part of an identifier. + // Handle as word if it follows a period return Ok(Some(Token::make_word(s.as_str(), None))); } } + // Handle "L" suffix for long numbers let long = if chars.peek() == Some(&'L') { chars.next(); true } else { false }; + + // Return the final token for the number Ok(Some(Token::Number(s, long))) } + + // Period (`.`) handling + '.' => { + chars.next(); // consume the dot + + match chars.peek() { + // Handle "._" case as a period followed by identifier + // if the last token was a word + Some('_') if matches!(prev_token, Some(Token::Word(_))) => { + Ok(Some(Token::Period)) + } + Some('_') => { + self.tokenizer_error( + chars.location(), + "Unexpected underscore here".to_string(), + ) + } + Some(ch) + // Hive and mysql dialects allow numeric prefixes for identifers + if ch.is_ascii_digit() + && self.dialect.supports_numeric_prefix() + && matches!(prev_token, Some(Token::Word(_))) => + { + Ok(Some(Token::Period)) + } + Some(ch) if ch.is_ascii_digit() => { + // Handle numbers starting with a dot (e.g., ".123") + let mut s = String::from("."); + let is_number_separator = |ch: char, next_char: Option| { + self.dialect.supports_numeric_literal_underscores() + && ch == '_' + && next_char.is_some_and(|c| c.is_ascii_digit()) + }; + + s += &peeking_next_take_while(chars, |ch, next_ch| { + ch.is_ascii_digit() || is_number_separator(ch, next_ch) + }); + + // Handle exponent part + if matches!(chars.peek(), Some('e' | 'E')) { + let mut exp = String::new(); + exp.push(chars.next().unwrap()); + + if matches!(chars.peek(), Some('+' | '-')) { + exp.push(chars.next().unwrap()); + } + + if matches!(chars.peek(), Some(c) if c.is_ascii_digit()) { + exp += &peeking_take_while(chars, |c| c.is_ascii_digit()); + s += &exp; + } + } + + // Handle "L" suffix for long numbers + let long = if chars.peek() == Some(&'L') { + chars.next(); + true + } else { + false + }; + + Ok(Some(Token::Number(s, long))) + } + _ => { + // Just a plain period + Ok(Some(Token::Period)) + } + } + } // punctuation '(' => self.consume_and_return(chars, Token::LParen), ')' => self.consume_and_return(chars, Token::RParen), @@ -2429,6 +2481,42 @@ mod tests { compare(expected, tokens); } + #[test] + fn tokenize_period_underscore() { + let sql = String::from("SELECT table._col"); + // a dialect that supports underscores in numeric literals + let dialect = PostgreSqlDialect {}; + let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap(); + + let expected = vec![ + Token::make_keyword("SELECT"), + Token::Whitespace(Whitespace::Space), + Token::Word(Word { + value: "table".to_string(), + quote_style: None, + keyword: Keyword::TABLE, + }), + Token::Period, + Token::Word(Word { + value: "_col".to_string(), + quote_style: None, + keyword: Keyword::NoKeyword, + }), + ]; + + compare(expected, tokens); + + let sql = String::from("SELECT ._123"); + if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { + panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); + } + + let sql = String::from("SELECT ._abc"); + if let Ok(tokens) = Tokenizer::new(&dialect, &sql).tokenize() { + panic!("Tokenizer should have failed on {sql}, but it succeeded with {tokens:?}"); + } + } + #[test] fn tokenize_select_float() { let sql = String::from("SELECT .1");