From 0f6e776e9f0a2815fe6651a42302fd4dd03a892c Mon Sep 17 00:00:00 2001
From: IGI-111 <igi-111@protonmail.com>
Date: Tue, 26 Sep 2023 10:47:07 +0200
Subject: [PATCH] Forbid bidirectional flow control characters in literals

Forbid directional formatting characters from
[UAX #9](https://www.unicode.org/reports/tr9/#Directional_Formatting_Characters)
in literals to fix #5047.
This is similar to rustc's `text_direction_codepoint_in_literal` lint.
Such characters are already implicitly forbidden in other parts of the syntax.
---
 Cargo.lock                  |  1 +
 sway-error/src/lex_error.rs |  2 ++
 sway-parse/Cargo.toml       |  1 +
 sway-parse/src/token.rs     | 72 +++++++++++++++++++++++++++++++++++--
 4 files changed, 73 insertions(+), 3 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c0fb96a7dbe..6184b5e04df 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -5935,6 +5935,7 @@ dependencies = [
  "sway-error",
  "sway-types",
  "thiserror",
+ "unicode-bidi",
  "unicode-xid",
 ]
 
diff --git a/sway-error/src/lex_error.rs b/sway-error/src/lex_error.rs
index a1ffd183592..0db174a7df9 100644
--- a/sway-error/src/lex_error.rs
+++ b/sway-error/src/lex_error.rs
@@ -55,6 +55,8 @@ pub enum LexErrorKind {
     UnicodeEscapeOutOfRange { position: usize },
     #[error("unicode escape represents an invalid char value")]
     UnicodeEscapeInvalidCharValue { span: Span },
+    #[error("unicode text direction codepoint in literal")]
+    UnicodeTextDirInLiteral { position: usize, character: char },
     #[error("invalid escape code")]
     InvalidEscapeCode { position: usize },
     #[error("invalid u256. Only hex literals are supported")]
diff --git a/sway-parse/Cargo.toml b/sway-parse/Cargo.toml
index 03f924aa58b..a57ec780900 100644
--- a/sway-parse/Cargo.toml
+++ b/sway-parse/Cargo.toml
@@ -17,6 +17,7 @@ sway-ast = { version = "0.46.0", path = "../sway-ast" }
 sway-error = { version = "0.46.0", path = "../sway-error" }
 sway-types = { version = "0.46.0", path = "../sway-types" }
 thiserror = "1.0"
+unicode-bidi = "0.3.13"
 unicode-xid = "0.2.2"
 
 [dev-dependencies]
diff --git a/sway-parse/src/token.rs b/sway-parse/src/token.rs
index 369277bae48..9cb90721b67 100644
--- a/sway-parse/src/token.rs
+++ b/sway-parse/src/token.rs
@@ -14,6 +14,7 @@ use sway_types::{
     ast::{Delimiter, PunctKind},
     Ident, SourceId, Span, Spanned,
 };
+use unicode_bidi::format_chars::{ALM, FSI, LRE, LRI, LRM, LRO, PDF, PDI, RLE, RLI, RLM, RLO};
 use unicode_xid::UnicodeXID;
 
 #[extension_trait]
@@ -465,7 +466,7 @@ fn lex_string(
                 },
             )
         };
-        let (_, next_character) = l
+        let (next_index, next_character) = l
             .stream
             .next()
             .ok_or_else(|| unclosed_string_lit(l, l.src.len() - 1))?;
@@ -473,6 +474,16 @@ fn lex_string(
             '\\' => parse_escape_code(l)
                 .map_err(|e| e.unwrap_or_else(|| unclosed_string_lit(l, l.src.len())))?,
             '"' => break,
+            // do not allow text direction codepoints
+            ALM | FSI | LRE | LRI | LRM | LRO | PDF | PDI | RLE | RLI | RLM | RLO => {
+                let kind = LexErrorKind::UnicodeTextDirInLiteral {
+                    position: next_index,
+                    character: next_character,
+                };
+                let span = span_one(l, next_index, next_character);
+                error(l.handler, LexError { span, kind });
+                continue;
+            }
             _ => next_character,
         });
     }
@@ -507,7 +518,17 @@ fn lex_char(
         }
     };
 
-    let (_, next_char) = next(l)?;
+    let (next_index, next_char) = next(l)?;
+    // do not allow text direction codepoints
+    if let ALM | FSI | LRE | LRI | LRM | LRO | PDF | PDI | RLE | RLI | RLM | RLO = next_char {
+        let kind = LexErrorKind::UnicodeTextDirInLiteral {
+            position: next_index,
+            character: next_char,
+        };
+        let span = span_one(l, next_index, next_char);
+        error(l.handler, LexError { span, kind });
+    }
+
     let parsed = escape(l, next_char)?;
 
     // Consume the closing `'`.
@@ -817,7 +838,52 @@ mod tests {
             TokenTree,
         },
     };
-    use sway_error::handler::Handler;
+    use sway_error::{
+        error::CompileError,
+        handler::Handler,
+        lex_error::{LexError, LexErrorKind},
+    };
+
+    #[test]
+    fn lex_bidi() { // bidi control codepoints inside string/char literals must surface as lex errors
+        let input = "
+            script;
+            use std::string::String;
+            fn main() {
+                let a = String::from_ascii_str(\"fuel\");
+                let b = String::from_ascii_str(\"fuel\u{202E}\u{2066}// Same string again\u{2069}\u{2066}\");
+                if a.as_bytes() == b.as_bytes() {
+                    log(\"same\");
+                } else {
+                    log(\"different\");
+                }
+                let lrm = '\u{202E}';
+                log(lrm);
+            }
+        "; // NOTE(review): U+202E is RLO, not LRM; the Sway variable name `lrm` is a misnomer but harmless here
+        let start = 0;
+        let end = input.len();
+        let path = None;
+        let handler = Handler::default();
+        let _stream = lex_commented(&handler, &Arc::from(input), start, end, &path).unwrap(); // lexing is expected to return Ok; the bidi errors are read back from `handler` below
+        let (errors, warnings) = handler.consume();
+        assert_eq!(warnings.len(), 0);
+        assert_eq!(errors.len(), 5); // four codepoints in the string literal assigned to `b` plus one in the char literal
+        for err in errors { // every reported error must be the text-direction lex error
+            assert_matches!(
+                err,
+                CompileError::Lex {
+                    error: LexError {
+                        span: _,
+                        kind: LexErrorKind::UnicodeTextDirInLiteral {
+                            position: _,
+                            character: _
+                        }
+                    }
+                }
+            );
+        }
+    }
 
     #[test]
     fn lex_commented_token_stream() {