From 2d9f0e2c50ff6131643fd0b2d5a9f65a7006f50c Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Wed, 3 Nov 2021 23:37:23 +0100
Subject: [PATCH 1/5] Optimize bidi character detection.

---
 compiler/rustc_parse/src/lexer/mod.rs | 45 +++++++++++++++++++++++----
 compiler/rustc_parse/src/lib.rs       |  1 +
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 09a3d1b902831..21d0ee60cdade 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -137,12 +137,45 @@ impl<'a> StringReader<'a> {
         // Opening delimiter of the length 2 is not included into the comment text.
         let content_start = start + BytePos(2);
         let content = self.str_from(content_start);
-        let span = self.mk_sp(start, self.pos);
-        const UNICODE_TEXT_FLOW_CHARS: &[char] = &[
-            '\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}',
-            '\u{202C}', '\u{2069}',
-        ];
-        if content.contains(UNICODE_TEXT_FLOW_CHARS) {
+
+        // Char   - UTF-8
+        // U+202A - E2 80 AA
+        // U+202B - E2 80 AB
+        // U+202C - E2 80 AC
+        // U+202D - E2 80 AD
+        // U+202E - E2 80 AE
+        // U+2066 - E2 81 A6
+        // U+2067 - E2 81 A7
+        // U+2068 - E2 81 A8
+        // U+2069 - E2 81 A9
+        let mut bytes = content.as_bytes();
+        let contains_flow_control_chars = loop {
+            match core::slice::memchr::memchr(0xE2, &bytes) {
+                Some(idx) => {
+                    // bytes are valid UTF-8 -> E2 must be followed by two bytes
+                    match bytes[idx + 1] {
+                        0x80 => {
+                            if (0xAA..=0xAE).contains(&bytes[idx + 2]) {
+                                break true;
+                            }
+                        }
+                        0x81 => {
+                            if (0xA6..=0xA9).contains(&bytes[idx + 2]) {
+                                break true;
+                            }
+                        }
+                        _ => {}
+                    }
+                    bytes = &bytes[idx + 3..];
+                }
+                None => {
+                    break false;
+                }
+            }
+        };
+
+        if contains_flow_control_chars {
+            let span = self.mk_sp(start, self.pos);
             self.sess.buffer_lint_with_diagnostic(
                 &TEXT_DIRECTION_CODEPOINT_IN_COMMENT,
                 span,
diff --git a/compiler/rustc_parse/src/lib.rs b/compiler/rustc_parse/src/lib.rs
index a40f47f895bbe..063b0183a8fd5 100644
--- a/compiler/rustc_parse/src/lib.rs
+++ b/compiler/rustc_parse/src/lib.rs
@@ -4,6 +4,7 @@
 #![feature(crate_visibility_modifier)]
 #![feature(if_let_guard)]
 #![feature(box_patterns)]
+#![feature(slice_internals)]
 #![recursion_limit = "256"]
 
 #[macro_use]

From a5b25a2cfa1adb52723fa4a5b458dd9d6272117a Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 4 Nov 2021 17:03:13 +0100
Subject: [PATCH 2/5] Create subslice as that leads to a smaller code size.

---
 compiler/rustc_parse/src/lexer/mod.rs | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index 21d0ee60cdade..c0f2863d01cd7 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -153,14 +153,15 @@ impl<'a> StringReader<'a> {
             match core::slice::memchr::memchr(0xE2, &bytes) {
                 Some(idx) => {
                     // bytes are valid UTF-8 -> E2 must be followed by two bytes
-                    match bytes[idx + 1] {
+                    let ch = &bytes[idx..idx + 3];
+                    match ch[1] {
                         0x80 => {
-                            if (0xAA..=0xAE).contains(&bytes[idx + 2]) {
+                            if (0xAA..=0xAE).contains(&ch[2]) {
                                 break true;
                             }
                         }
                         0x81 => {
-                            if (0xA6..=0xA9).contains(&bytes[idx + 2]) {
+                            if (0xA6..=0xA9).contains(&ch[2]) {
                                 break true;
                             }
                         }

From 7885233df01abf51d2947b6b466a17a1843b2a60 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 4 Nov 2021 23:31:42 +0100
Subject: [PATCH 3/5] Optimize literal, doc comment lint as well, extract
 function.

---
 compiler/rustc_ast/src/lib.rs                 |  2 +
 compiler/rustc_ast/src/util/unicode.rs        | 44 +++++++++++++++++++
 compiler/rustc_lint/src/context.rs            |  4 +-
 .../src/hidden_unicode_codepoints.rs          | 12 ++---
 compiler/rustc_parse/src/lexer/mod.rs         | 41 +----------------
 5 files changed, 54 insertions(+), 49 deletions(-)
 create mode 100644 compiler/rustc_ast/src/util/unicode.rs

diff --git a/compiler/rustc_ast/src/lib.rs b/compiler/rustc_ast/src/lib.rs
index e3c610585d978..8a8dc44489b36 100644
--- a/compiler/rustc_ast/src/lib.rs
+++ b/compiler/rustc_ast/src/lib.rs
@@ -16,6 +16,7 @@
 #![feature(nll)]
 #![feature(min_specialization)]
 #![recursion_limit = "256"]
+#![feature(slice_internals)]
 
 #[macro_use]
 extern crate rustc_macros;
@@ -25,6 +26,7 @@ pub mod util {
     pub mod comments;
     pub mod literal;
     pub mod parser;
+    pub mod unicode;
 }
 
 pub mod ast;
diff --git a/compiler/rustc_ast/src/util/unicode.rs b/compiler/rustc_ast/src/util/unicode.rs
new file mode 100644
index 0000000000000..ad73d6e4fe205
--- /dev/null
+++ b/compiler/rustc_ast/src/util/unicode.rs
@@ -0,0 +1,44 @@
+pub const TEXT_FLOW_CONTROL_CHARS: &[char] = &[
+    '\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{202C}',
+    '\u{2069}',
+];
+
+#[inline]
+pub fn contains_text_flow_control_chars(s: &str) -> bool {
+    // Char   - UTF-8
+    // U+202A - E2 80 AA
+    // U+202B - E2 80 AB
+    // U+202C - E2 80 AC
+    // U+202D - E2 80 AD
+    // U+202E - E2 80 AE
+    // U+2066 - E2 81 A6
+    // U+2067 - E2 81 A7
+    // U+2068 - E2 81 A8
+    // U+2069 - E2 81 A9
+    let mut bytes = s.as_bytes();
+    loop {
+        match core::slice::memchr::memchr(0xE2, &bytes) {
+            Some(idx) => {
+                // bytes are valid UTF-8 -> E2 must be followed by two bytes
+                let ch = &bytes[idx..idx + 3];
+                match ch[1] {
+                    0x80 => {
+                        if (0xAA..=0xAE).contains(&ch[2]) {
+                            break true;
+                        }
+                    }
+                    0x81 => {
+                        if (0xA6..=0xA9).contains(&ch[2]) {
+                            break true;
+                        }
+                    }
+                    _ => {}
+                }
+                bytes = &bytes[idx + 3..];
+            }
+            None => {
+                break false;
+            }
+        }
+    }
+}
diff --git a/compiler/rustc_lint/src/context.rs b/compiler/rustc_lint/src/context.rs
index 6fd0a5b95f9f6..4c936dec6f2cd 100644
--- a/compiler/rustc_lint/src/context.rs
+++ b/compiler/rustc_lint/src/context.rs
@@ -16,9 +16,9 @@
 
 use self::TargetLint::*;
 
-use crate::hidden_unicode_codepoints::UNICODE_TEXT_FLOW_CHARS;
 use crate::levels::{is_known_lint_tool, LintLevelsBuilder};
 use crate::passes::{EarlyLintPassObject, LateLintPassObject};
+use ast::util::unicode::TEXT_FLOW_CONTROL_CHARS;
 use rustc_ast as ast;
 use rustc_data_structures::fx::FxHashMap;
 use rustc_data_structures::sync;
@@ -602,7 +602,7 @@ pub trait LintContext: Sized {
                     let spans: Vec<_> = content
                         .char_indices()
                         .filter_map(|(i, c)| {
-                            UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| {
+                            TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
                                 let lo = span.lo() + BytePos(2 + i as u32);
                                 (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
                             })
diff --git a/compiler/rustc_lint/src/hidden_unicode_codepoints.rs b/compiler/rustc_lint/src/hidden_unicode_codepoints.rs
index 1bcdcb806fc43..fde84be9a7c30 100644
--- a/compiler/rustc_lint/src/hidden_unicode_codepoints.rs
+++ b/compiler/rustc_lint/src/hidden_unicode_codepoints.rs
@@ -1,4 +1,5 @@
 use crate::{EarlyContext, EarlyLintPass, LintContext};
+use ast::util::unicode::{contains_text_flow_control_chars, TEXT_FLOW_CONTROL_CHARS};
 use rustc_ast as ast;
 use rustc_errors::{Applicability, SuggestionStyle};
 use rustc_span::{BytePos, Span, Symbol};
@@ -37,11 +38,6 @@ declare_lint! {
 
 declare_lint_pass!(HiddenUnicodeCodepoints => [TEXT_DIRECTION_CODEPOINT_IN_LITERAL]);
 
-crate const UNICODE_TEXT_FLOW_CHARS: &[char] = &[
-    '\u{202A}', '\u{202B}', '\u{202D}', '\u{202E}', '\u{2066}', '\u{2067}', '\u{2068}', '\u{202C}',
-    '\u{2069}',
-];
-
 impl HiddenUnicodeCodepoints {
     fn lint_text_direction_codepoint(
         &self,
@@ -57,7 +53,7 @@ impl HiddenUnicodeCodepoints {
             .as_str()
             .char_indices()
             .filter_map(|(i, c)| {
-                UNICODE_TEXT_FLOW_CHARS.contains(&c).then(|| {
+                TEXT_FLOW_CONTROL_CHARS.contains(&c).then(|| {
                     let lo = span.lo() + BytePos(i as u32 + padding);
                     (c, span.with_lo(lo).with_hi(lo + BytePos(c.len_utf8() as u32)))
                 })
@@ -131,7 +127,7 @@ impl HiddenUnicodeCodepoints {
 impl EarlyLintPass for HiddenUnicodeCodepoints {
     fn check_attribute(&mut self, cx: &EarlyContext<'_>, attr: &ast::Attribute) {
         if let ast::AttrKind::DocComment(_, comment) = attr.kind {
-            if comment.as_str().contains(UNICODE_TEXT_FLOW_CHARS) {
+            if contains_text_flow_control_chars(&comment.as_str()) {
                 self.lint_text_direction_codepoint(cx, comment, attr.span, 0, false, "doc comment");
             }
         }
@@ -142,7 +138,7 @@ impl EarlyLintPass for HiddenUnicodeCodepoints {
         let (text, span, padding) = match &expr.kind {
             ast::ExprKind::Lit(ast::Lit { token, kind, span }) => {
                 let text = token.symbol;
-                if !text.as_str().contains(UNICODE_TEXT_FLOW_CHARS) {
+                if !contains_text_flow_control_chars(&text.as_str()) {
                     return;
                 }
                 let padding = match kind {
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index c0f2863d01cd7..cf35c3cd53b2f 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -1,6 +1,7 @@
 use rustc_ast::ast::{self, AttrStyle};
 use rustc_ast::token::{self, CommentKind, Token, TokenKind};
 use rustc_ast::tokenstream::{Spacing, TokenStream};
+use rustc_ast::util::unicode::contains_text_flow_control_chars;
 use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError, PResult};
 use rustc_lexer::unescape::{self, Mode};
 use rustc_lexer::{Base, DocStyle, RawStrError};
@@ -137,45 +138,7 @@ impl<'a> StringReader<'a> {
         // Opening delimiter of the length 2 is not included into the comment text.
         let content_start = start + BytePos(2);
         let content = self.str_from(content_start);
-
-        // Char   - UTF-8
-        // U+202A - E2 80 AA
-        // U+202B - E2 80 AB
-        // U+202C - E2 80 AC
-        // U+202D - E2 80 AD
-        // U+202E - E2 80 AE
-        // U+2066 - E2 81 A6
-        // U+2067 - E2 81 A7
-        // U+2068 - E2 81 A8
-        // U+2069 - E2 81 A9
-        let mut bytes = content.as_bytes();
-        let contains_flow_control_chars = loop {
-            match core::slice::memchr::memchr(0xE2, &bytes) {
-                Some(idx) => {
-                    // bytes are valid UTF-8 -> E2 must be followed by two bytes
-                    let ch = &bytes[idx..idx + 3];
-                    match ch[1] {
-                        0x80 => {
-                            if (0xAA..=0xAE).contains(&ch[2]) {
-                                break true;
-                            }
-                        }
-                        0x81 => {
-                            if (0xA6..=0xA9).contains(&ch[2]) {
-                                break true;
-                            }
-                        }
-                        _ => {}
-                    }
-                    bytes = &bytes[idx + 3..];
-                }
-                None => {
-                    break false;
-                }
-            }
-        };
-
-        if contains_flow_control_chars {
+        if contains_text_flow_control_chars(content) {
             let span = self.mk_sp(start, self.pos);
             self.sess.buffer_lint_with_diagnostic(
                 &TEXT_DIRECTION_CODEPOINT_IN_COMMENT,

From e339e4789f919d27bf8c13ca41abf5529b4fd056 Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Thu, 4 Nov 2021 23:40:59 +0100
Subject: [PATCH 4/5] Remove now unused feature from rustc_parse

---
 compiler/rustc_parse/src/lib.rs | 1 -
 1 file changed, 1 deletion(-)

diff --git a/compiler/rustc_parse/src/lib.rs b/compiler/rustc_parse/src/lib.rs
index 063b0183a8fd5..a40f47f895bbe 100644
--- a/compiler/rustc_parse/src/lib.rs
+++ b/compiler/rustc_parse/src/lib.rs
@@ -4,7 +4,6 @@
 #![feature(crate_visibility_modifier)]
 #![feature(if_let_guard)]
 #![feature(box_patterns)]
-#![feature(slice_internals)]
 #![recursion_limit = "256"]
 
 #[macro_use]

From 39110beab03ed74d1cb97df77721c0ad2fcb165b Mon Sep 17 00:00:00 2001
From: Hans Kratz <hans@appfour.com>
Date: Fri, 5 Nov 2021 00:39:34 +0100
Subject: [PATCH 5/5] Use one match instead of a staggered match.

---
 compiler/rustc_ast/src/util/unicode.rs | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/compiler/rustc_ast/src/util/unicode.rs b/compiler/rustc_ast/src/util/unicode.rs
index ad73d6e4fe205..f009f7b300ce0 100644
--- a/compiler/rustc_ast/src/util/unicode.rs
+++ b/compiler/rustc_ast/src/util/unicode.rs
@@ -21,17 +21,8 @@ pub fn contains_text_flow_control_chars(s: &str) -> bool {
             Some(idx) => {
                 // bytes are valid UTF-8 -> E2 must be followed by two bytes
                 let ch = &bytes[idx..idx + 3];
-                match ch[1] {
-                    0x80 => {
-                        if (0xAA..=0xAE).contains(&ch[2]) {
-                            break true;
-                        }
-                    }
-                    0x81 => {
-                        if (0xA6..=0xA9).contains(&ch[2]) {
-                            break true;
-                        }
-                    }
+                match ch {
+                    [_, 0x80, 0xAA..=0xAE] | [_, 0x81, 0xA6..=0xA9] => break true,
                     _ => {}
                 }
                 bytes = &bytes[idx + 3..];