Auto merge of rust-lang#88781 - estebank:emoji-idents, r=oli-obk

bors · bors · commit 23a436606b11 · 2021-11-25T08:16:08.000Z
Tokenize emoji as if they were valid identifiers In the lexer, consider emojis to be valid identifiers and reject them later to avoid knock down parse errors. Partially address rust-lang#86102.
diff --git a/Cargo.lock b/Cargo.lock
@@ -4040,6 +4040,7 @@ name = "rustc_lexer"
 version = "0.1.0"
 dependencies = [
  "expect-test",
+ "unic-emoji-char",
  "unicode-xid",
 ]
 
@@ -5510,6 +5511,47 @@ version = "0.1.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
 
+[[package]]
+name = "unic-char-property"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
+dependencies = [
+ "unic-char-range",
+]
+
+[[package]]
+name = "unic-char-range"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
+
+[[package]]
+name = "unic-common"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
+
+[[package]]
+name = "unic-emoji-char"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
+dependencies = [
+ "unic-char-property",
+ "unic-char-range",
+ "unic-ucd-version",
+]
+
+[[package]]
+name = "unic-ucd-version"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
+dependencies = [
+ "unic-common",
+]
+
 [[package]]
 name = "unicase"
 version = "2.6.0"
diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs
@@ -730,7 +730,7 @@ impl EmitterWriter {
         }
 
         let source_string = match file.get_line(line.line_index - 1) {
-            Some(s) => replace_tabs(&*s),
+            Some(s) => normalize_whitespace(&*s),
             None => return Vec::new(),
         };
 
@@ -1286,7 +1286,7 @@ impl EmitterWriter {
             }
             for &(ref text, _) in msg.iter() {
                 // Account for newlines to align output to its label.
-                for (line, text) in replace_tabs(text).lines().enumerate() {
+                for (line, text) in normalize_whitespace(text).lines().enumerate() {
                     buffer.append(
                         0 + line,
                         &format!(
@@ -1550,7 +1550,7 @@ impl EmitterWriter {
 
                             self.draw_line(
                                 &mut buffer,
-                                &replace_tabs(&unannotated_line),
+                                &normalize_whitespace(&unannotated_line),
                                 annotated_file.lines[line_idx + 1].line_index - 1,
                                 last_buffer_line_num,
                                 width_offset,
@@ -1672,7 +1672,7 @@ impl EmitterWriter {
                     buffer.puts(
                         row_num - 1,
                         max_line_num_len + 3,
-                        &replace_tabs(
+                        &normalize_whitespace(
                             &*file_lines
                                 .file
                                 .get_line(file_lines.lines[line_pos].line_index)
@@ -1698,7 +1698,7 @@ impl EmitterWriter {
                 }
 
                 // print the suggestion
-                buffer.append(row_num, &replace_tabs(line), Style::NoStyle);
+                buffer.append(row_num, &normalize_whitespace(line), Style::NoStyle);
 
                 // Colorize addition/replacements with green.
                 for &SubstitutionHighlight { start, end } in highlight_parts {
@@ -2081,6 +2081,7 @@ fn num_decimal_digits(num: usize) -> usize {
 // We replace some characters so the CLI output is always consistent and underlines aligned.
 const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
     ('\t', "    "),   // We do our own tab replacement
+    ('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
     ('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
     ('\u{202B}', ""), // supported accross CLIs and can cause confusion due to the bytes on disk
     ('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
@@ -2092,7 +2093,7 @@ const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
     ('\u{2069}', ""),
 ];
 
-fn replace_tabs(str: &str) -> String {
+fn normalize_whitespace(str: &str) -> String {
     let mut s = str.to_string();
     for (c, replacement) in OUTPUT_REPLACEMENTS {
         s = s.replace(*c, replacement);
diff --git a/compiler/rustc_interface/src/passes.rs b/compiler/rustc_interface/src/passes.rs
@@ -35,7 +35,7 @@ use rustc_session::output::{filename_for_input, filename_for_metadata};
 use rustc_session::search_paths::PathKind;
 use rustc_session::{Limit, Session};
 use rustc_span::symbol::{sym, Ident, Symbol};
-use rustc_span::FileName;
+use rustc_span::{FileName, MultiSpan};
 use rustc_trait_selection::traits;
 use rustc_typeck as typeck;
 use tempfile::Builder as TempFileBuilder;
@@ -450,6 +450,19 @@ pub fn configure_and_expand(
         });
     }
 
+    // Gate identifiers containing invalid Unicode codepoints that were recovered during lexing.
+    sess.parse_sess.bad_unicode_identifiers.with_lock(|identifiers| {
+        let mut identifiers: Vec<_> = identifiers.drain().collect();
+        identifiers.sort_by_key(|&(key, _)| key);
+        for (ident, mut spans) in identifiers.into_iter() {
+            spans.sort();
+            sess.diagnostic().span_err(
+                MultiSpan::from(spans),
+                &format!("identifiers cannot contain emoji: `{}`", ident),
+            );
+        }
+    });
+
     Ok(krate)
 }
 
diff --git a/compiler/rustc_lexer/Cargo.toml b/compiler/rustc_lexer/Cargo.toml
@@ -17,6 +17,7 @@ doctest = false
 # Note that this crate purposefully does not depend on other rustc crates
 [dependencies]
 unicode-xid = "0.2.0"
+unic-emoji-char = "0.9.0"
 
 [dev-dependencies]
 expect-test = "1.0"
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
@@ -64,6 +64,8 @@ pub enum TokenKind {
     /// "ident" or "continue"
     /// At this step keywords are also considered identifiers.
     Ident,
+    /// Like the above, but containing invalid unicode codepoints.
+    InvalidIdent,
     /// "r#ident"
     RawIdent,
     /// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
@@ -411,6 +413,10 @@ impl Cursor<'_> {
                 let kind = Str { terminated };
                 Literal { kind, suffix_start }
             }
+            // Identifier starting with an emoji. Only lexed for graceful error recovery.
+            c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
+                self.fake_ident_or_unknown_prefix()
+            }
             _ => Unknown,
         };
         Token::new(token_kind, self.len_consumed())
@@ -492,10 +498,28 @@ impl Cursor<'_> {
         // we see a prefix here, it is definitely an unknown prefix.
         match self.first() {
             '#' | '"' | '\'' => UnknownPrefix,
+            c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
+                self.fake_ident_or_unknown_prefix()
+            }
             _ => Ident,
         }
     }
 
+    fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
+        // Start is already eaten, eat the rest of identifier.
+        self.eat_while(|c| {
+            unicode_xid::UnicodeXID::is_xid_continue(c)
+                || (!c.is_ascii() && unic_emoji_char::is_emoji(c))
+                || c == '\u{200d}'
+        });
+        // Known prefixes must have been handled earlier. So if
+        // we see a prefix here, it is definitely an unknown prefix.
+        match self.first() {
+            '#' | '"' | '\'' => UnknownPrefix,
+            _ => InvalidIdent,
+        }
+    }
+
     fn number(&mut self, first_digit: char) -> LiteralKind {
         debug_assert!('0' <= self.prev() && self.prev() <= '9');
         let mut base = Base::Decimal;
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
@@ -1,3 +1,4 @@
+use crate::lexer::unicode_chars::UNICODE_ARRAY;
 use rustc_ast::ast::{self, AttrStyle};
 use rustc_ast::token::{self, CommentKind, Token, TokenKind};
 use rustc_ast::tokenstream::{Spacing, TokenStream};
@@ -222,6 +223,22 @@ impl<'a> StringReader<'a> {
                 }
                 token::Ident(sym, is_raw_ident)
             }
+            rustc_lexer::TokenKind::InvalidIdent
+                // Do not recover an identifier with emoji if the codepoint is a confusable
+                // with a recoverable substitution token, like `➖`.
+                if UNICODE_ARRAY
+                    .iter()
+                    .find(|&&(c, _, _)| {
+                        let sym = self.str_from(start);
+                        sym.chars().count() == 1 && c == sym.chars().next().unwrap()
+                    })
+                    .is_none() =>
+            {
+                let sym = nfc_normalize(self.str_from(start));
+                let span = self.mk_sp(start, self.pos);
+                self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
+                token::Ident(sym, false)
+            }
             rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                 let suffix_start = start + BytePos(suffix_start as u32);
                 let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@@ -293,7 +310,7 @@ impl<'a> StringReader<'a> {
             rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
             rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
 
-            rustc_lexer::TokenKind::Unknown => {
+            rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
                 let c = self.str_from(start).chars().next().unwrap();
                 let mut err =
                     self.struct_fatal_span_char(start, self.pos, "unknown start of token", c);
diff --git a/compiler/rustc_parse/src/lexer/unicode_chars.rs b/compiler/rustc_parse/src/lexer/unicode_chars.rs
@@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder};
 use rustc_span::{symbol::kw, BytePos, Pos, Span};
 
 #[rustfmt::skip] // for line breaks
-const UNICODE_ARRAY: &[(char, &str, char)] = &[
+pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[
     (' ', "Line Separator", ' '),
     (' ', "Paragraph Separator", ' '),
     (' ', "Ogham Space mark", ' '),
diff --git a/compiler/rustc_session/src/parse.rs b/compiler/rustc_session/src/parse.rs
@@ -119,8 +119,13 @@ pub struct ParseSess {
     pub config: CrateConfig,
     pub edition: Edition,
     pub missing_fragment_specifiers: Lock<FxHashMap<Span, NodeId>>,
-    /// Places where raw identifiers were used. This is used for feature-gating raw identifiers.
+    /// Places where raw identifiers were used. This is used to avoid complaining about idents
+    /// clashing with keywords in new editions.
     pub raw_identifier_spans: Lock<Vec<Span>>,
+    /// Places where identifiers that contain invalid Unicode codepoints but that look like they
+    /// should be. Useful to avoid bad tokenization when encountering emoji. We group them to
+    /// provide a single error per unique incorrect identifier.
+    pub bad_unicode_identifiers: Lock<FxHashMap<Symbol, Vec<Span>>>,
     source_map: Lrc<SourceMap>,
     pub buffered_lints: Lock<Vec<BufferedEarlyLint>>,
     /// Contains the spans of block expressions that could have been incomplete based on the
@@ -160,6 +165,7 @@ impl ParseSess {
             edition: ExpnId::root().expn_data().edition,
             missing_fragment_specifiers: Default::default(),
             raw_identifier_spans: Lock::new(Vec::new()),
+            bad_unicode_identifiers: Lock::new(Default::default()),
             source_map,
             buffered_lints: Lock::new(vec![]),
             ambiguous_block_expr_parse: Lock::new(FxHashMap::default()),
diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs
@@ -632,7 +632,7 @@ impl<'a> Classifier<'a> {
                 },
                 Some(c) => c,
             },
-            TokenKind::RawIdent | TokenKind::UnknownPrefix => {
+            TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
                 Class::Ident(self.new_span(before, text))
             }
             TokenKind::Lifetime { .. } => Class::Lifetime,
diff --git a/src/test/ui/parser/emoji-identifiers.rs b/src/test/ui/parser/emoji-identifiers.rs
@@ -0,0 +1,16 @@
+struct ABig👩‍👩‍👧‍👧Family; //~ ERROR identifiers cannot contain emoji
+struct 👀; //~ ERROR identifiers cannot contain emoji
+impl 👀 {
+    fn full_of_✨() -> 👀 { //~ ERROR identifiers cannot contain emoji
+        👀
+    }
+}
+fn i_like_to_😅_a_lot() -> 👀 { //~ ERROR identifiers cannot contain emoji
+    👀::full_of✨() //~ ERROR no function or associated item named `full_of✨` found for struct `👀`
+    //~^ ERROR identifiers cannot contain emoji
+}
+fn main() {
+    let _ = i_like_to_😄_a_lot() ➖ 4; //~ ERROR cannot find function `i_like_to_😄_a_lot` in this scope
+    //~^ ERROR identifiers cannot contain emoji
+    //~| ERROR unknown start of token: \u{2796}
+}
diff --git a/src/test/ui/parser/emoji-identifiers.stderr b/src/test/ui/parser/emoji-identifiers.stderr
@@ -0,0 +1,83 @@
+error: unknown start of token: \u{2796}
+  --> $DIR/emoji-identifiers.rs:13:33
+   |
+LL |     let _ = i_like_to_😄_a_lot() ➖ 4;
+   |                                  ^^
+   |
+help: Unicode character '➖' (Heavy Minus Sign) looks like '-' (Minus/Hyphen), but it is not
+   |
+LL |     let _ = i_like_to_😄_a_lot() - 4;
+   |                                  ~
+
+error[E0425]: cannot find function `i_like_to_😄_a_lot` in this scope
+  --> $DIR/emoji-identifiers.rs:13:13
+   |
+LL | fn i_like_to_😅_a_lot() -> 👀 {
+   | ----------------------------- similarly named function `i_like_to_😅_a_lot` defined here
+...
+LL |     let _ = i_like_to_😄_a_lot() ➖ 4;
+   |             ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_😅_a_lot`
+
+error: identifiers cannot contain emoji: `ABig👩👩👧👧Family`
+  --> $DIR/emoji-identifiers.rs:1:8
+   |
+LL | struct ABig👩👩👧👧Family;
+   |        ^^^^^^^^^^^^^^^^^^
+
+error: identifiers cannot contain emoji: `👀`
+  --> $DIR/emoji-identifiers.rs:2:8
+   |
+LL | struct 👀;
+   |        ^^
+LL | impl 👀 {
+   |      ^^
+LL |     fn full_of_✨() -> 👀 {
+   |                        ^^
+LL |         👀
+   |         ^^
+...
+LL | fn i_like_to_😅_a_lot() -> 👀 {
+   |                            ^^
+LL |     👀::full_of✨()
+   |     ^^
+
+error: identifiers cannot contain emoji: `full_of_✨`
+  --> $DIR/emoji-identifiers.rs:4:8
+   |
+LL |     fn full_of_✨() -> 👀 {
+   |        ^^^^^^^^^^
+
+error: identifiers cannot contain emoji: `i_like_to_😅_a_lot`
+  --> $DIR/emoji-identifiers.rs:8:4
+   |
+LL | fn i_like_to_😅_a_lot() -> 👀 {
+   |    ^^^^^^^^^^^^^^^^^^
+
+error: identifiers cannot contain emoji: `full_of✨`
+  --> $DIR/emoji-identifiers.rs:9:8
+   |
+LL |     👀::full_of✨()
+   |         ^^^^^^^^^
+
+error: identifiers cannot contain emoji: `i_like_to_😄_a_lot`
+  --> $DIR/emoji-identifiers.rs:13:13
+   |
+LL |     let _ = i_like_to_😄_a_lot() ➖ 4;
+   |             ^^^^^^^^^^^^^^^^^^
+
+error[E0599]: no function or associated item named `full_of✨` found for struct `👀` in the current scope
+  --> $DIR/emoji-identifiers.rs:9:8
+   |
+LL | struct 👀;
+   | ---------- function or associated item `full_of✨` not found for this
+...
+LL |     👀::full_of✨()
+   |         ^^^^^^^^^
+   |         |
+   |         function or associated item not found in `👀`
+   |         help: there is an associated function with a similar name: `full_of_✨`
+
+error: aborting due to 9 previous errors
+
+Some errors have detailed explanations: E0425, E0599.
+For more information about an error, try `rustc --explain E0425`.
diff --git a/src/tools/cargo b/src/tools/cargo
@@ -1 +1 @@
-Subproject commit e1fb17631eb1b3665cdbe45b1c186111577ef512
+Subproject commit 7f08ace4f1305de7f3b1b0e2f765911957226bd4
diff --git a/src/tools/tidy/src/deps.rs b/src/tools/tidy/src/deps.rs

Original file line number	Diff line number	Diff line change
`@@ -632,7 +632,7 @@ impl<'a> Classifier<'a> {`
`632`	`632`	`},`
`633`	`633`	`Some(c) => c,`
`634`	`634`	`},`
`635`		`- TokenKind::RawIdent \| TokenKind::UnknownPrefix => {`
	`635`	`+ TokenKind::RawIdent \| TokenKind::UnknownPrefix \| TokenKind::InvalidIdent => {`
`636`	`636`	`Class::Ident(self.new_span(before, text))`
`637`	`637`	`}`
`638`	`638`	`TokenKind::Lifetime { .. } => Class::Lifetime,`