From 0f6e776e9f0a2815fe6651a42302fd4dd03a892c Mon Sep 17 00:00:00 2001 From: IGI-111 Date: Tue, 26 Sep 2023 10:47:07 +0200 Subject: [PATCH] Forbid bidirectional flow control characters in literals Forbid directional formatting characters from [UAX #9](https://www.unicode.org/reports/tr9/#Directional_Formatting_Characters) in literals to fix #5047. This is similar to rustc's `text_direction_codepoint_in_literal` lint. Such characters are already implicitly forbidden in other parts of the syntax. --- Cargo.lock | 1 + sway-error/src/lex_error.rs | 2 ++ sway-parse/Cargo.toml | 1 + sway-parse/src/token.rs | 72 +++++++++++++++++++++++++++++++++++-- 4 files changed, 73 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c0fb96a7dbe..6184b5e04df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5935,6 +5935,7 @@ dependencies = [ "sway-error", "sway-types", "thiserror", + "unicode-bidi", "unicode-xid", ] diff --git a/sway-error/src/lex_error.rs b/sway-error/src/lex_error.rs index a1ffd183592..0db174a7df9 100644 --- a/sway-error/src/lex_error.rs +++ b/sway-error/src/lex_error.rs @@ -55,6 +55,8 @@ pub enum LexErrorKind { UnicodeEscapeOutOfRange { position: usize }, #[error("unicode escape represents an invalid char value")] UnicodeEscapeInvalidCharValue { span: Span }, + #[error("unicode text direction codepoint in literal")] + UnicodeTextDirInLiteral { position: usize, character: char }, #[error("invalid escape code")] InvalidEscapeCode { position: usize }, #[error("invalid u256. Only hex literals are supported")] diff --git a/sway-parse/Cargo.toml b/sway-parse/Cargo.toml index 03f924aa58b..a57ec780900 100644 --- a/sway-parse/Cargo.toml +++ b/sway-parse/Cargo.toml @@ -17,6 +17,7 @@ sway-ast = { version = "0.46.0", path = "../sway-ast" } sway-error = { version = "0.46.0", path = "../sway-error" } sway-types = { version = "0.46.0", path = "../sway-types" } thiserror = "1.0" +unicode-bidi = "0.3.13" unicode-xid = "0.2.2" [dev-dependencies] diff --git a/sway-parse/src/token.rs b/sway-parse/src/token.rs index 369277bae48..9cb90721b67 100644 --- a/sway-parse/src/token.rs +++ b/sway-parse/src/token.rs @@ -14,6 +14,7 @@ use sway_types::{ ast::{Delimiter, PunctKind}, Ident, SourceId, Span, Spanned, }; +use unicode_bidi::format_chars::{ALM, FSI, LRE, LRI, LRM, LRO, PDF, PDI, RLE, RLI, RLM, RLO}; use unicode_xid::UnicodeXID; #[extension_trait] @@ -465,7 +466,7 @@ fn lex_string( }, ) }; - let (_, next_character) = l + let (next_index, next_character) = l .stream .next() .ok_or_else(|| unclosed_string_lit(l, l.src.len() - 1))?; @@ -473,6 +474,16 @@ fn lex_string( '\\' => parse_escape_code(l) .map_err(|e| e.unwrap_or_else(|| unclosed_string_lit(l, l.src.len())))?, '"' => break, + // do not allow text direction codepoints + ALM | FSI | LRE | LRI | LRM | LRO | PDF | PDI | RLE | RLI | RLM | RLO => { + let kind = LexErrorKind::UnicodeTextDirInLiteral { + position: next_index, + character: next_character, + }; + let span = span_one(l, next_index, next_character); + error(l.handler, LexError { span, kind }); + continue; + } _ => next_character, }); } @@ -507,7 +518,17 @@ fn lex_char( } }; - let (_, next_char) = next(l)?; + let (next_index, next_char) = next(l)?; + // do not allow text direction codepoints + if let ALM | FSI | LRE | LRI | LRM | LRO | PDF | PDI | RLE | RLI | RLM | RLO = next_char { + let kind = LexErrorKind::UnicodeTextDirInLiteral { + position: next_index, + character: next_char, + }; + let span = span_one(l, next_index, next_char); + error(l.handler, LexError { span, kind }); + } + let parsed = escape(l, next_char)?; // Consume the closing `'`. @@ -817,7 +838,52 @@ mod tests { TokenTree, }, }; - use sway_error::handler::Handler; + use sway_error::{ + error::CompileError, + handler::Handler, + lex_error::{LexError, LexErrorKind}, + }; + + #[test] + fn lex_bidi() { + let input = " + script; + use std::string::String; + fn main() { + let a = String::from_ascii_str(\"fuel\"); + let b = String::from_ascii_str(\"fuel\u{202E}\u{2066}// Same string again\u{2069}\u{2066}\"); + if a.as_bytes() == b.as_bytes() { + log(\"same\"); + } else { + log(\"different\"); + } + let lrm = '\u{202E}'; + log(lrm); + } + "; + let start = 0; + let end = input.len(); + let path = None; + let handler = Handler::default(); + let _stream = lex_commented(&handler, &Arc::from(input), start, end, &path).unwrap(); + let (errors, warnings) = handler.consume(); + assert_eq!(warnings.len(), 0); + assert_eq!(errors.len(), 5); + for err in errors { + assert_matches!( + err, + CompileError::Lex { + error: LexError { + span: _, + kind: LexErrorKind::UnicodeTextDirInLiteral { + position: _, + character: _ + } + } + } + ); + } + } #[test] fn lex_commented_token_stream() {