@@ -743,8 +743,12 @@ impl std::error::Error for TokenizerError {}
 
 struct State<'a> {
     peekable: Peekable<Chars<'a>>,
+    /// Reference to the original source string being tokenized
+    source: &'a str,
     pub line: u64,
     pub col: u64,
+    /// Byte position in the source string
+    pub byte_pos: usize,
 }
 
 impl State<'_> {
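The key invariant: `byte_pos` counts bytes, not characters, because Rust's `&str` can only be sliced at byte offsets that fall on UTF-8 character boundaries. A minimal sketch of why a character count would not work (illustrative values, not part of the patch):

```rust
fn main() {
    let source = "héllo";
    // 'é' occupies two bytes in UTF-8, so the string is 6 bytes long
    // even though it has 5 characters.
    assert_eq!(source.chars().count(), 5);
    assert_eq!(source.len(), 6);
    // Slicing at a byte offset inside 'é' (e.g. &source[0..2]) would panic,
    // which is why State must track exact byte positions.
    assert_eq!(&source[0..3], "hé");
}
```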
@@ -759,6 +763,8 @@ impl State<'_> {
                 } else {
                     self.col += 1;
                 }
+                // Update byte position (characters can be multi-byte in UTF-8)
+                self.byte_pos += s.len_utf8();
                 Some(s)
             }
         }
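Since every consumed character flows through `State::next`, accumulating `len_utf8()` there keeps `byte_pos` in lockstep with the iterator. A standalone sketch of that bookkeeping (hypothetical input, same arithmetic):

```rust
fn main() {
    let source = "a€b"; // '€' encodes to three bytes in UTF-8
    let mut byte_pos = 0usize;
    for ch in source.chars() {
        // Mirror what State::next now does on every consumed char.
        byte_pos += ch.len_utf8();
    }
    assert_eq!(byte_pos, source.len()); // 1 + 3 + 1 = 5
}
```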
@@ -769,6 +775,12 @@ impl State<'_> {
         self.peekable.peek()
     }
 
+    /// Return the character after the next character (lookahead by 2) without advancing the stream.
+    pub fn peek_next(&self) -> Option<char> {
+        // Use the source and byte_pos instead of cloning the peekable iterator
+        self.source[self.byte_pos..].chars().nth(1)
+    }
+
     pub fn location(&self) -> Location {
         Location {
             line: self.line,
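This works because `byte_pos` always points at the next unconsumed character, so re-slicing the source and skipping one char yields the two-ahead character without cloning the whole `Peekable`. A sketch of the indexing (not the crate's API):

```rust
fn main() {
    let source = "1e5";
    let byte_pos = 0; // nothing consumed yet; peek() would yield '1'
    // nth(1) skips the peeked char and returns the one after it.
    assert_eq!(source[byte_pos..].chars().nth(1), Some('e'));
    assert_eq!(source[byte_pos..].chars().nth(2), Some('5'));
}
```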
@@ -893,8 +905,10 @@ impl<'a> Tokenizer<'a> {
     ) -> Result<(), TokenizerError> {
         let mut state = State {
             peekable: self.query.chars().peekable(),
+            source: self.query,
             line: 1,
             col: 1,
+            byte_pos: 0,
         };
 
         let mut location = state.location();
@@ -912,18 +926,21 @@ impl<'a> Tokenizer<'a> {
     fn tokenize_identifier_or_keyword(
         &self,
         ch: impl IntoIterator<Item = char>,
-        chars: &mut State,
+        chars: &mut State<'a>,
     ) -> Result<Option<Token>, TokenizerError> {
         chars.next(); // consume the first char
-        let ch: String = ch.into_iter().collect();
-        let word = self.tokenize_word(ch, chars);
+        // Calculate total byte length without allocating a String
+        let consumed_byte_len: usize = ch.into_iter().map(|c| c.len_utf8()).sum();
+        let word = self.tokenize_word(consumed_byte_len, chars);
 
         // TODO: implement parsing of exponent here
         if word.chars().all(|x| x.is_ascii_digit() || x == '.') {
             let mut inner_state = State {
                 peekable: word.chars().peekable(),
+                source: &word,
                 line: 0,
                 col: 0,
+                byte_pos: 0,
             };
             let mut s = peeking_take_while(&mut inner_state, |ch| matches!(ch, '0'..='9' | '.'));
             let s2 = peeking_take_while(chars, |ch| matches!(ch, '0'..='9' | '.'));
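Summing `len_utf8()` over the already-consumed prefix chars gives the same byte count that collecting them into a `String` would, minus the allocation. A sketch of the equivalence (illustrative input):

```rust
fn main() {
    let prefix = ['N', '\''];
    let as_string: String = prefix.iter().collect();
    let summed: usize = prefix.iter().map(|c| c.len_utf8()).sum();
    assert_eq!(summed, as_string.len()); // same byte length, no allocation
}
```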
@@ -937,7 +954,7 @@ impl<'a> Tokenizer<'a> {
     /// Get the next token or return None
     fn next_token(
         &self,
-        chars: &mut State,
+        chars: &mut State<'a>,
         prev_token: Option<&Token>,
     ) -> Result<Option<Token>, TokenizerError> {
         match chars.peek() {
@@ -988,7 +1005,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with a "b" or "B"
-                            let s = self.tokenize_word(b, chars);
+                            let s = self.tokenize_word(b.len_utf8(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
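At these call sites the prefix char has already been consumed, so its `len_utf8()` is exactly the byte count that `tokenize_word` must step back over. A quick check (illustrative):

```rust
fn main() {
    // All current special prefixes are ASCII, hence one byte each, but
    // computing len_utf8() keeps the call sites correct for any prefix.
    for ch in ['b', 'B', 'r', 'N', 'E', 'U', 'X'] {
        assert_eq!(ch.len_utf8(), 1);
    }
}
```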
@@ -1015,7 +1032,7 @@ impl<'a> Tokenizer<'a> {
                             ),
                         _ => {
                             // regular identifier starting with an "r" or "R"
-                            let s = self.tokenize_word(b, chars);
+                            let s = self.tokenize_word(b.len_utf8(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -1034,7 +1051,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "N"
-                            let s = self.tokenize_word(n, chars);
+                            let s = self.tokenize_word(n.len_utf8(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -1051,7 +1068,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "E" or "e"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x.len_utf8(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -1070,7 +1087,7 @@ impl<'a> Tokenizer<'a> {
                         }
                     }
                     // regular identifier starting with a "U" or "u"
-                    let s = self.tokenize_word(x, chars);
+                    let s = self.tokenize_word(x.len_utf8(), chars);
                     Ok(Some(Token::make_word(&s, None)))
                 }
                 // The spec only allows an uppercase 'X' to introduce a hex
@@ -1085,7 +1102,7 @@ impl<'a> Tokenizer<'a> {
                         }
                         _ => {
                             // regular identifier starting with an "X"
-                            let s = self.tokenize_word(x, chars);
+                            let s = self.tokenize_word(x.len_utf8(), chars);
                             Ok(Some(Token::make_word(&s, None)))
                         }
                     }
@@ -1876,13 +1893,26 @@ impl<'a> Tokenizer<'a> {
         comment
     }
 
-    /// Tokenize an identifier or keyword, after the first char is already consumed.
-    fn tokenize_word(&self, first_chars: impl Into<String>, chars: &mut State) -> String {
-        let mut s = first_chars.into();
-        s.push_str(&peeking_take_while(chars, |ch| {
-            self.dialect.is_identifier_part(ch)
-        }));
-        s
+    /// Tokenize an identifier or keyword, after the first char(s) have already been consumed.
+    /// `consumed_byte_len` is the byte length of the consumed character(s).
+    fn tokenize_word(&self, consumed_byte_len: usize, chars: &mut State<'a>) -> String {
+        // Calculate where the first character started
+        let first_char_byte_pos = chars.byte_pos - consumed_byte_len;
+
+        // Use the zero-copy version and convert to String
+        self.tokenize_word_borrowed(first_char_byte_pos, chars)
+            .to_string()
+    }
+
+    /// Tokenize an identifier or keyword, returning a borrowed slice when possible.
+    /// The first character position must be provided (before it was consumed).
+    /// Returns a slice with the same lifetime as the State's source.
+    fn tokenize_word_borrowed(&self, first_char_byte_pos: usize, chars: &mut State<'a>) -> &'a str {
+        // Consume the rest of the word
+        borrow_slice_until(chars, |ch| self.dialect.is_identifier_part(ch));
+
+        // Return a slice from the first char to the current position
+        &chars.source[first_char_byte_pos..chars.byte_pos]
     }
 
     /// Read a quoted identifier
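The subtraction in `tokenize_word` recovers the word's start: by the time it runs, `byte_pos` already sits past the consumed prefix, so stepping back `consumed_byte_len` bytes lands on the first character. A standalone sketch of that arithmetic (simplified, no State involved):

```rust
fn main() {
    let source = "être_id ";
    // Suppose the first char 'ê' (2 bytes) was already consumed:
    let first = source.chars().next().unwrap();
    let byte_pos_after_prefix = first.len_utf8();
    // tokenize_word steps back over the consumed prefix...
    let first_char_byte_pos = byte_pos_after_prefix - first.len_utf8();
    // ...and, once the tail is consumed, slices up to the stopping point.
    let stop = source.find(' ').unwrap();
    assert_eq!(&source[first_char_byte_pos..stop], "être_id");
}
```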
@@ -2176,35 +2206,82 @@ impl<'a> Tokenizer<'a> {
 /// Read from `chars` until `predicate` returns `false` or EOF is hit.
 /// Return the characters read as String, and keep the first non-matching
 /// char available as `chars.next()`.
-fn peeking_take_while(chars: &mut State, mut predicate: impl FnMut(char) -> bool) -> String {
-    let mut s = String::new();
+fn peeking_take_while(chars: &mut State, predicate: impl FnMut(char) -> bool) -> String {
+    borrow_slice_until(chars, predicate).to_string()
+}
+
+/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
+///
+/// # Arguments
+/// * `chars` - The character iterator state (contains reference to original source)
+/// * `predicate` - Function that returns true while we should continue taking characters
+///
+/// # Returns
+/// A borrowed slice of the source string containing the matched characters
+fn borrow_slice_until<'a>(
+    chars: &mut State<'a>,
+    mut predicate: impl FnMut(char) -> bool,
+) -> &'a str {
+    // Record the starting byte position
+    let start_pos = chars.byte_pos;
+
+    // Consume characters while predicate is true
     while let Some(&ch) = chars.peek() {
         if predicate(ch) {
-            chars.next(); // consume
-            s.push(ch);
+            chars.next(); // consume (this updates byte_pos)
         } else {
             break;
         }
     }
-    s
+
+    // Get the ending byte position
+    let end_pos = chars.byte_pos;
+
+    // Return the slice from the original source
+    &chars.source[start_pos..end_pos]
 }
 
-/// Same as peeking_take_while, but also passes the next character to the predicate.
-fn peeking_next_take_while(
-    chars: &mut State,
+/// Borrow a slice from the original string until `predicate` returns `false` or EOF is hit.
+/// This version also passes the next character to the predicate for lookahead.
+/// This is a zero-copy version of `peeking_next_take_while`.
+///
+/// # Arguments
+/// * `chars` - The character iterator state (contains reference to original source)
+/// * `predicate` - Function that returns true while we should continue taking characters.
+///   Takes the current char and an optional next char for lookahead.
+///
+/// # Returns
+/// A borrowed slice of the source string containing the matched characters
+fn borrow_slice_until_next<'a>(
+    chars: &mut State<'a>,
     mut predicate: impl FnMut(char, Option<char>) -> bool,
-) -> String {
-    let mut s = String::new();
+) -> &'a str {
+    // Record the starting byte position
+    let start_pos = chars.byte_pos;
+
+    // Consume characters while predicate is true
     while let Some(&ch) = chars.peek() {
-        let next_char = chars.peekable.clone().nth(1);
+        let next_char = chars.peek_next();
         if predicate(ch, next_char) {
-            chars.next(); // consume
-            s.push(ch);
+            chars.next(); // consume (this updates byte_pos)
         } else {
             break;
         }
     }
-    s
+
+    // Get the ending byte position
+    let end_pos = chars.byte_pos;
+
+    // Return the slice from the original source
+    &chars.source[start_pos..end_pos]
+}
+
+/// Same as peeking_take_while, but also passes the next character to the predicate.
+fn peeking_next_take_while(
+    chars: &mut State,
+    predicate: impl FnMut(char, Option<char>) -> bool,
+) -> String {
+    borrow_slice_until_next(chars, predicate).to_string()
 }
 
 fn unescape_single_quoted_string(chars: &mut State<'_>) -> Option<String> {
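To make the zero-copy behavior concrete, here is a self-contained sketch of the helper's core idea (field names mirror the patch, but line/col tracking and the Tokenizer are omitted): the returned `&str` aliases the original buffer rather than owning a copy.

```rust
use std::iter::Peekable;
use std::str::Chars;

struct State<'a> {
    peekable: Peekable<Chars<'a>>,
    source: &'a str,
    byte_pos: usize,
}

impl<'a> State<'a> {
    fn next(&mut self) -> Option<char> {
        let ch = self.peekable.next()?;
        // Keep the byte offset in lockstep with the iterator.
        self.byte_pos += ch.len_utf8();
        Some(ch)
    }
}

fn borrow_slice_until<'a>(
    chars: &mut State<'a>,
    mut predicate: impl FnMut(char) -> bool,
) -> &'a str {
    let start_pos = chars.byte_pos;
    while let Some(&ch) = chars.peekable.peek() {
        if predicate(ch) {
            chars.next(); // consume (updates byte_pos)
        } else {
            break;
        }
    }
    &chars.source[start_pos..chars.byte_pos]
}

fn main() {
    let source = "abc123 rest";
    let mut state = State {
        peekable: source.chars().peekable(),
        source,
        byte_pos: 0,
    };
    let word = borrow_slice_until(&mut state, |ch| ch.is_alphanumeric());
    assert_eq!(word, "abc123");
    // Zero-copy: the slice points into the original buffer.
    assert_eq!(word.as_ptr(), source.as_ptr());
}
```

The `as_ptr` comparison at the end is what distinguishes this from the old `String`-building loop: no bytes are copied until a caller explicitly asks for ownership via `.to_string()`.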
@@ -3496,8 +3573,10 @@ mod tests {
         let s = format!("'{s}'");
         let mut state = State {
             peekable: s.chars().peekable(),
+            source: &s,
             line: 0,
             col: 0,
+            byte_pos: 0,
         };
 
         assert_eq!(