From b00472a2920b47206e0fedb73d5d515d6d957027 Mon Sep 17 00:00:00 2001 From: David Peter Date: Mon, 1 Jul 2024 20:31:10 +0200 Subject: [PATCH] Support string escape sequences --- numbat/src/parser.rs | 93 ++++++++++++++++++++++++++++++++++++-- numbat/src/pretty_print.rs | 16 ++++++- numbat/src/tokenizer.rs | 54 +++++++++++++++++++--- numbat/src/typed_ast.rs | 5 +- 4 files changed, 155 insertions(+), 13 deletions(-) diff --git a/numbat/src/parser.rs b/numbat/src/parser.rs index 0ba9ec23..458386c8 100644 --- a/numbat/src/parser.rs +++ b/numbat/src/parser.rs @@ -1339,7 +1339,7 @@ impl<'a> Parser<'a> { } else if let Some(token) = self.match_exact(TokenKind::StringFixed) { Ok(Expression::String( token.span, - vec![StringPart::Fixed(strip_first_and_last(&token.lexeme))], + vec![StringPart::Fixed(strip_and_escape(&token.lexeme))], )) } else if let Some(token) = self.match_exact(TokenKind::StringInterpolationStart) { let mut parts = Vec::new(); @@ -1358,7 +1358,7 @@ impl<'a> Parser<'a> { self.interpolation(&mut parts, inner_token)?; } TokenKind::StringInterpolationEnd => { - parts.push(StringPart::Fixed(strip_first_and_last(&inner_token.lexeme))); + parts.push(StringPart::Fixed(strip_and_escape(&inner_token.lexeme))); has_end = true; break; } @@ -1424,7 +1424,7 @@ impl<'a> Parser<'a> { } fn interpolation(&mut self, parts: &mut Vec, token: &Token) -> Result<()> { - parts.push(StringPart::Fixed(strip_first_and_last(&token.lexeme))); + parts.push(StringPart::Fixed(strip_and_escape(&token.lexeme))); let expr = self.expression()?; @@ -1728,8 +1728,40 @@ impl<'a> Parser<'a> { } } -fn strip_first_and_last(s: &str) -> String { - s[1..(s.len() - 1)].to_string() +fn strip_and_escape(s: &str) -> String { + let trimmed = &s[1..(s.len() - 1)]; + + let mut result = String::with_capacity(trimmed.len()); + let mut escaped = false; + for c in trimmed.chars() { + if escaped { + // Keep this in sync with 'escape_numbat_string', + // where the reverse replacement is needed + match c { + 'n' => result.push('\n'), + 'r' => result.push('\r'), + 't' => result.push('\t'), + '"' => result.push('"'), + '0' => result.push('\0'), + '\\' => result.push('\\'), + '{' => result.push('{'), + '}' => result.push('}'), + _ => { + // We follow Python here, where an unknown escape sequence + // does not lead to an error, but is just passed through. + result.push('\\'); + result.push(c) + } + } + escaped = false; + } else if c == '\\' { + escaped = true; + } else { + result.push(c); + } + } + + result } /// Parse a string. @@ -2751,6 +2783,57 @@ mod tests { Expression::String(Span::dummy(), vec![StringPart::Fixed("hello world".into())]), ); + parse_as_expression( + &[r#""hello \"world\"!""#], + Expression::String( + Span::dummy(), + vec![StringPart::Fixed("hello \"world\"!".into())], + ), + ); + + parse_as_expression( + &[r#""newline: \n, return: \r, tab: \t, quote: \", null: \0, backslash: \\, open_brace: \{, close brace: \}.""#], + Expression::String( + Span::dummy(), + vec![StringPart::Fixed("newline: \n, return: \r, tab: \t, quote: \", null: \0, backslash: \\, open_brace: {, close brace: }.".into())], + ), + ); + + parse_as_expression( + &[r#""\"""#], + Expression::String(Span::dummy(), vec![StringPart::Fixed("\"".into())]), + ); + + parse_as_expression( + &[r#""\\""#], + Expression::String(Span::dummy(), vec![StringPart::Fixed("\\".into())]), + ); + + parse_as_expression( + &[r#""\\\"""#], + Expression::String(Span::dummy(), vec![StringPart::Fixed("\\\"".into())]), + ); + + parse_as_expression( + &[r#""\"\\""#], + Expression::String(Span::dummy(), vec![StringPart::Fixed("\"\\".into())]), + ); + + parse_as_expression( + &[r#""\\\n""#], + Expression::String(Span::dummy(), vec![StringPart::Fixed("\\\n".into())]), + ); + + parse_as_expression( + &[r#""\n\\""#], + Expression::String(Span::dummy(), vec![StringPart::Fixed("\n\\".into())]), + ); + + parse_as_expression( + &[r#""\\n""#], + Expression::String(Span::dummy(), vec![StringPart::Fixed("\\n".into())]), + ); + parse_as_expression( &["\"pi = {pi}\""], Expression::String( diff --git a/numbat/src/pretty_print.rs b/numbat/src/pretty_print.rs index 783b6bcd..18e07c16 100644 --- a/numbat/src/pretty_print.rs +++ b/numbat/src/pretty_print.rs @@ -10,8 +10,22 @@ impl PrettyPrint for bool { } } +pub fn escape_numbat_string(s: &str) -> String { + return s + .replace("\\", "\\\\") + .replace("\n", "\\n") + .replace("\r", "\\r") + .replace("\t", "\\t") + .replace("\"", "\\\"") + .replace("\0", "\\0") + .replace("{", "\\{") + .replace("}", "\\}"); +} + impl PrettyPrint for String { fn pretty_print(&self) -> Markup { - crate::markup::operator("\"") + crate::markup::string(self) + crate::markup::operator("\"") + crate::markup::operator("\"") + + crate::markup::string(escape_numbat_string(self)) + + crate::markup::operator("\"") } } diff --git a/numbat/src/tokenizer.rs b/numbat/src/tokenizer.rs index d8f4e498..41cbb620 100644 --- a/numbat/src/tokenizer.rs +++ b/numbat/src/tokenizer.rs @@ -343,6 +343,26 @@ impl Tokenizer { Ok(()) } + fn consume_string(&mut self) -> Result<()> { + let mut escaped = false; + loop { + escaped = match self.peek() { + None => { + break; + } + Some('\\') if !escaped => true, + Some('"') | Some('{') if !escaped => { + break; + } + Some(_) => false, + }; + + self.advance(); + } + + Ok(()) + } + fn scan_single_token(&mut self) -> Result> { static KEYWORDS: OnceLock> = OnceLock::new(); let keywords = KEYWORDS.get_or_init(|| { @@ -545,9 +565,7 @@ impl Tokenizer { InterpolationState::Outside => { self.string_start = self.token_start; - while self.peek().map(|c| c != '"' && c != '{').unwrap_or(false) { - self.advance(); - } + self.consume_string()?; if self.match_char('"') { TokenKind::StringFixed @@ -606,9 +624,7 @@ impl Tokenizer { } } '}' if self.interpolation_state.is_inside() => { - while self.peek().map(|c| c != '"' && c != '{').unwrap_or(false) { - self.advance(); - } + self.consume_string()?; if self.match_char('"') { self.interpolation_state = InterpolationState::Outside; @@ -1100,6 +1116,32 @@ fn test_tokenize_string() { tokenize("\"foo = {foo, bar = {bar}\"", 0).unwrap_err().kind, TokenizerErrorKind::UnexpectedCurlyInInterpolation ); + + insta::assert_snapshot!( + tokenize_reduced_pretty(r#""start \"inner\" end""#).unwrap(), + @r###" + "\"start \\\"inner\\\" end\"", StringFixed, (1, 1) + "", Eof, (1, 22) + "### + ); + + insta::assert_snapshot!( + tokenize_reduced_pretty(r#""start \{inner\} end""#).unwrap(), + @r###" + "\"start \\{inner\\} end\"", StringFixed, (1, 1) + "", Eof, (1, 22) + "### + ); + + insta::assert_snapshot!( + tokenize_reduced_pretty(r#""start {1} \"inner\" end""#).unwrap(), + @r###" + "\"start {", StringInterpolationStart, (1, 1) + "1", Number, (1, 9) + "} \\\"inner\\\" end\"", StringInterpolationEnd, (1, 10) + "", Eof, (1, 26) + "### + ); } #[test] diff --git a/numbat/src/typed_ast.rs b/numbat/src/typed_ast.rs index 22cf04e5..2b2b1efd 100644 --- a/numbat/src/typed_ast.rs +++ b/numbat/src/typed_ast.rs @@ -5,6 +5,7 @@ use crate::arithmetic::Exponent; pub use crate::ast::{BinaryOperator, TypeExpression, UnaryOperator}; use crate::ast::{ProcedureKind, TypeAnnotation, TypeParameterBound}; use crate::dimension::DimensionRegistry; +use crate::pretty_print::escape_numbat_string; use crate::traversal::{ForAllExpressions, ForAllTypeSchemes}; use crate::type_variable::TypeVariable; use crate::typechecker::type_scheme::TypeScheme; @@ -460,7 +461,7 @@ pub enum StringPart { impl PrettyPrint for StringPart { fn pretty_print(&self) -> Markup { match self { - StringPart::Fixed(s) => m::string(s), + StringPart::Fixed(s) => m::string(escape_numbat_string(s)), StringPart::Interpolation { span: _, expr, @@ -1339,6 +1340,8 @@ mod tests { roundtrip_check("(-3)!"); roundtrip_check("megapoints"); roundtrip_check("Foo { foo: 1 meter, bar: 1 second }"); + roundtrip_check("\"foo\""); + roundtrip_check("\"newline: \\n\""); } #[test]