From b21518cfa010a18c7871fa1772d824b43979779a Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Sat, 9 Sep 2023 12:34:48 +0200 Subject: [PATCH 1/2] Avoid allocating in lex_decimal --- crates/ruff_python_parser/src/lexer.rs | 127 ++++++++++++++++--------- 1 file changed, 83 insertions(+), 44 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 7d736d5ba7921..52717e6b7bc1b 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -28,7 +28,6 @@ //! //! [Lexical analysis]: https://docs.python.org/3/reference/lexical_analysis.html -use std::borrow::Cow; use std::iter::FusedIterator; use std::{char, cmp::Ordering, str::FromStr}; @@ -263,9 +262,10 @@ impl<'source> Lexer<'source> { 'x' | 'o' | 'b' )); - let value_text = self.radix_run(None, radix); + let mut number = LexedText::new(self.offset(), self.source); + self.radix_run(&mut number, radix); let value = - BigInt::from_str_radix(&value_text, radix.as_u32()).map_err(|e| LexicalError { + BigInt::from_str_radix(number.as_str(), radix.as_u32()).map_err(|e| LexicalError { error: LexicalErrorType::OtherError(format!("{e:?}")), location: self.token_range().start(), })?; @@ -278,15 +278,14 @@ impl<'source> Lexer<'source> { debug_assert!(self.cursor.previous().is_ascii_digit() || self.cursor.previous() == '.'); let start_is_zero = first_digit_or_dot == '0'; - let mut value_text = if first_digit_or_dot == '.' { - String::new() - } else { - self.radix_run(Some(first_digit_or_dot), Radix::Decimal) - .into_owned() + let mut number = LexedText::new(self.token_start(), self.source); + if first_digit_or_dot != '.' { + number.push_matching(first_digit_or_dot); + self.radix_run(&mut number, Radix::Decimal); }; let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') { - value_text.push('.'); + number.push_matching('.'); if self.cursor.eat_char('_') { return Err(LexicalError { @@ -295,7 +294,7 @@ impl<'source> Lexer<'source> { }); } - value_text.push_str(&self.radix_run(None, Radix::Decimal)); + self.radix_run(&mut number, Radix::Decimal); true } else { // Normal number: @@ -304,14 +303,18 @@ impl<'source> Lexer<'source> { let is_float = match self.cursor.rest().as_bytes() { [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => { - value_text.push('e'); - self.cursor.bump(); // e | E + if self.cursor.bump() == Some('e') { + number.push_matching('e'); + } else { + // 'E' + number.push_different('e'); + } if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) { - value_text.push(sign); + number.push_matching(sign); } - value_text.push_str(&self.radix_run(None, Radix::Decimal)); + self.radix_run(&mut number, Radix::Decimal); true } @@ -320,7 +323,7 @@ impl<'source> Lexer<'source> { if is_float { // Improvement: Use `Cow` instead of pushing to value text - let value = f64::from_str(&value_text).map_err(|_| LexicalError { + let value = f64::from_str(number.as_str()).map_err(|_| LexicalError { error: LexicalErrorType::OtherError("Invalid decimal literal".to_owned()), location: self.token_start(), })?; @@ -337,10 +340,10 @@ impl<'source> Lexer<'source> { } else { // Parse trailing 'j': if self.cursor.eat_if(|c| matches!(c, 'j' | 'J')).is_some() { - let imag = f64::from_str(&value_text).unwrap(); + let imag = f64::from_str(number.as_str()).unwrap(); Ok(Tok::Complex { real: 0.0, imag }) } else { - let value = value_text.parse::().unwrap(); + let value = number.as_str().parse::().unwrap(); if start_is_zero && !value.is_zero() { // leading zeros in decimal integer literals are not permitted return Err(LexicalError { @@ -356,34 +359,19 @@ impl<'source> Lexer<'source> { /// Consume a sequence of numbers with the given radix, /// the digits can be decorated with underscores /// like this: '`1_2_3_4`' == '1234' - fn radix_run(&mut self, first: Option, radix: Radix) -> Cow<'source, str> { - let start = if let Some(first) = first { - self.offset() - first.text_len() - } else { - self.offset() - }; - self.cursor.eat_while(|c| radix.is_digit(c)); - - let number = &self.source[TextRange::new(start, self.offset())]; - - // Number that contains `_` separators. Remove them from the parsed text. - if radix.is_digit(self.cursor.second()) && self.cursor.eat_char('_') { - let mut value_text = number.to_string(); - - loop { - if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) { - value_text.push(c); - } else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) { - // Skip over `_` - self.cursor.bump(); - } else { - break; - } + fn radix_run(&mut self, number: &mut LexedText, radix: Radix) { + loop { + if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) { + number.push_matching(c); + } + // Number that contains `_` separators. Remove them from the parsed text. + else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) { + // Skip over `_` + self.cursor.bump(); + number.skip_char(); + } else { + break; } - - Cow::Owned(value_text) - } else { - Cow::Borrowed(number) } } @@ -1236,6 +1224,57 @@ const fn is_python_whitespace(c: char) -> bool { ) } +enum LexedText<'a> { + Source { source: &'a str, range: TextRange }, + Owned(String), +} + +impl<'a> LexedText<'a> { + fn new(start: TextSize, source: &'a str) -> Self { + Self::Source { + range: TextRange::empty(start), + source, + } + } + + fn push_matching(&mut self, c: char) { + match self { + LexedText::Source { range, .. } => *range = range.add_end(c.text_len()), + LexedText::Owned(owned) => owned.push(c), + } + } + + fn push_different(&mut self, c: char) { + match self { + LexedText::Source { range, source } => { + let mut text = source[*range].to_string(); + text.push(c); + *self = LexedText::Owned(text); + } + LexedText::Owned(text) => text.push(c), + }; + } + + fn as_str<'b>(&'b self) -> &'b str + where + 'b: 'a, + { + match self { + LexedText::Source { range, source } => &source[*range], + LexedText::Owned(owned) => owned, + } + } + + fn skip_char(&mut self) { + match self { + LexedText::Source { range, source } => { + *self = LexedText::Owned(source[*range].to_string()); + } + LexedText::Owned(_) => {} + } + } +} + #[cfg(test)] mod tests { use num_bigint::BigInt; From 1a3f51d78fa7493bcf64c86a042f6deffd61ea4b Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Mon, 11 Sep 2023 08:27:29 +0200 Subject: [PATCH 2/2] Remove push_different and add debug assertion --- crates/ruff_python_parser/src/lexer.rs | 34 +++++++++----------------- 1 file changed, 11 insertions(+), 23 deletions(-) diff --git a/crates/ruff_python_parser/src/lexer.rs b/crates/ruff_python_parser/src/lexer.rs index 52717e6b7bc1b..ee1e5e14e068f 100644 --- a/crates/ruff_python_parser/src/lexer.rs +++ b/crates/ruff_python_parser/src/lexer.rs @@ -280,12 +280,12 @@ impl<'source> Lexer<'source> { let mut number = LexedText::new(self.token_start(), self.source); if first_digit_or_dot != '.' { - number.push_matching(first_digit_or_dot); + number.push(first_digit_or_dot); self.radix_run(&mut number, Radix::Decimal); }; let is_float = if first_digit_or_dot == '.' || self.cursor.eat_char('.') { - number.push_matching('.'); + number.push('.'); if self.cursor.eat_char('_') { return Err(LexicalError { @@ -303,15 +303,11 @@ impl<'source> Lexer<'source> { let is_float = match self.cursor.rest().as_bytes() { [b'e' | b'E', b'0'..=b'9', ..] | [b'e' | b'E', b'-' | b'+', b'0'..=b'9', ..] => { - if self.cursor.bump() == Some('e') { - number.push_matching('e'); - } else { - // 'E' - number.push_different('e'); - } + // 'e' | 'E' + number.push(self.cursor.bump().unwrap()); if let Some(sign) = self.cursor.eat_if(|c| matches!(c, '+' | '-')) { - number.push_matching(sign); + number.push(sign); } self.radix_run(&mut number, Radix::Decimal); @@ -362,7 +358,7 @@ impl<'source> Lexer<'source> { fn radix_run(&mut self, number: &mut LexedText, radix: Radix) { loop { if let Some(c) = self.cursor.eat_if(|c| radix.is_digit(c)) { - number.push_matching(c); + number.push(c); } // Number that contains `_` separators. Remove them from the parsed text. else if self.cursor.first() == '_' && radix.is_digit(self.cursor.second()) { @@ -1237,22 +1233,14 @@ impl<'a> LexedText<'a> { } } - fn push_matching(&mut self, c: char) { - match self { - LexedText::Source { range, .. } => *range = range.add_end(c.text_len()), - LexedText::Owned(owned) => owned.push(c), - } - } - - fn push_different(&mut self, c: char) { + fn push(&mut self, c: char) { match self { LexedText::Source { range, source } => { - let mut text = source[*range].to_string(); - text.push(c); - *self = LexedText::Owned(text); + *range = range.add_end(c.text_len()); + debug_assert!(source[*range].ends_with(c)); } - LexedText::Owned(text) => text.push(c), - }; + LexedText::Owned(owned) => owned.push(c), + } } fn as_str<'b>(&'b self) -> &'b str