diff --git a/Cargo.lock b/Cargo.lock index 51ed441d0dbe7..edc227d9db7ba 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4040,6 +4040,7 @@ name = "rustc_lexer" version = "0.1.0" dependencies = [ "expect-test", + "unic-emoji-char", "unicode-xid", ] @@ -5510,6 +5511,47 @@ version = "0.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c" +[[package]] +name = "unic-char-property" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221" +dependencies = [ + "unic-char-range", +] + +[[package]] +name = "unic-char-range" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc" + +[[package]] +name = "unic-common" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc" + +[[package]] +name = "unic-emoji-char" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d" +dependencies = [ + "unic-char-property", + "unic-char-range", + "unic-ucd-version", +] + +[[package]] +name = "unic-ucd-version" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4" +dependencies = [ + "unic-common", +] + [[package]] name = "unicase" version = "2.6.0" diff --git a/compiler/rustc_errors/src/emitter.rs b/compiler/rustc_errors/src/emitter.rs index 6b79962ddd609..d64a589bd9b2a 100644 --- a/compiler/rustc_errors/src/emitter.rs +++ b/compiler/rustc_errors/src/emitter.rs @@ -730,7 +730,7 @@ impl EmitterWriter { } let source_string = match file.get_line(line.line_index - 1) { - Some(s) => replace_tabs(&*s), + Some(s) => normalize_whitespace(&*s), None => return Vec::new(), }; @@ -1286,7 +1286,7 @@ impl EmitterWriter { } for &(ref text, _) in msg.iter() { // Account for newlines to align output to its label. - for (line, text) in replace_tabs(text).lines().enumerate() { + for (line, text) in normalize_whitespace(text).lines().enumerate() { buffer.append( 0 + line, &format!( @@ -1550,7 +1550,7 @@ impl EmitterWriter { self.draw_line( &mut buffer, - &replace_tabs(&unannotated_line), + &normalize_whitespace(&unannotated_line), annotated_file.lines[line_idx + 1].line_index - 1, last_buffer_line_num, width_offset, @@ -1672,7 +1672,7 @@ impl EmitterWriter { buffer.puts( row_num - 1, max_line_num_len + 3, - &replace_tabs( + &normalize_whitespace( &*file_lines .file .get_line(file_lines.lines[line_pos].line_index) @@ -1698,7 +1698,7 @@ impl EmitterWriter { } // print the suggestion - buffer.append(row_num, &replace_tabs(line), Style::NoStyle); + buffer.append(row_num, &normalize_whitespace(line), Style::NoStyle); // Colorize addition/replacements with green. for &SubstitutionHighlight { start, end } in highlight_parts { @@ -2081,6 +2081,7 @@ fn num_decimal_digits(num: usize) -> usize { // We replace some characters so the CLI output is always consistent and underlines aligned. const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[ ('\t', " "), // We do our own tab replacement + ('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters. ('\u{202A}', ""), // The following unicode text flow control characters are inconsistently ('\u{202B}', ""), // supported accross CLIs and can cause confusion due to the bytes on disk ('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always. @@ -2092,7 +2093,7 @@ const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[ ('\u{2069}', ""), ]; -fn replace_tabs(str: &str) -> String { +fn normalize_whitespace(str: &str) -> String { let mut s = str.to_string(); for (c, replacement) in OUTPUT_REPLACEMENTS { s = s.replace(*c, replacement); diff --git a/compiler/rustc_interface/src/passes.rs b/compiler/rustc_interface/src/passes.rs index b073ee9682fbd..d3917dfb14ab3 100644 --- a/compiler/rustc_interface/src/passes.rs +++ b/compiler/rustc_interface/src/passes.rs @@ -35,7 +35,7 @@ use rustc_session::output::{filename_for_input, filename_for_metadata}; use rustc_session::search_paths::PathKind; use rustc_session::{Limit, Session}; use rustc_span::symbol::{sym, Ident, Symbol}; -use rustc_span::FileName; +use rustc_span::{FileName, MultiSpan}; use rustc_trait_selection::traits; use rustc_typeck as typeck; use tempfile::Builder as TempFileBuilder; @@ -450,6 +450,19 @@ pub fn configure_and_expand( }); } + // Gate identifiers containing invalid Unicode codepoints that were recovered during lexing. + sess.parse_sess.bad_unicode_identifiers.with_lock(|identifiers| { + let mut identifiers: Vec<_> = identifiers.drain().collect(); + identifiers.sort_by_key(|&(key, _)| key); + for (ident, mut spans) in identifiers.into_iter() { + spans.sort(); + sess.diagnostic().span_err( + MultiSpan::from(spans), + &format!("identifiers cannot contain emoji: `{}`", ident), + ); + } + }); + Ok(krate) } diff --git a/compiler/rustc_lexer/Cargo.toml b/compiler/rustc_lexer/Cargo.toml index 60c146f457b42..35af110537d4c 100644 --- a/compiler/rustc_lexer/Cargo.toml +++ b/compiler/rustc_lexer/Cargo.toml @@ -17,6 +17,7 @@ doctest = false # Note that this crate purposefully does not depend on other rustc crates [dependencies] unicode-xid = "0.2.0" +unic-emoji-char = "0.9.0" [dev-dependencies] expect-test = "1.0" diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs index b64a891cb2526..44b002fa93f42 100644 --- a/compiler/rustc_lexer/src/lib.rs +++ b/compiler/rustc_lexer/src/lib.rs @@ -64,6 +64,8 @@ pub enum TokenKind { /// "ident" or "continue" /// At this step keywords are also considered identifiers. Ident, + /// Like the above, but containing invalid unicode codepoints. + InvalidIdent, /// "r#ident" RawIdent, /// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the @@ -411,6 +413,10 @@ impl Cursor<'_> { let kind = Str { terminated }; Literal { kind, suffix_start } } + // Identifier starting with an emoji. Only lexed for graceful error recovery. + c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => { + self.fake_ident_or_unknown_prefix() + } _ => Unknown, }; Token::new(token_kind, self.len_consumed()) @@ -492,10 +498,28 @@ impl Cursor<'_> { // we see a prefix here, it is definitely an unknown prefix. match self.first() { '#' | '"' | '\'' => UnknownPrefix, + c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => { + self.fake_ident_or_unknown_prefix() + } _ => Ident, } } + fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind { + // Start is already eaten, eat the rest of identifier. + self.eat_while(|c| { + unicode_xid::UnicodeXID::is_xid_continue(c) + || (!c.is_ascii() && unic_emoji_char::is_emoji(c)) + || c == '\u{200d}' + }); + // Known prefixes must have been handled earlier. So if + // we see a prefix here, it is definitely an unknown prefix. + match self.first() { + '#' | '"' | '\'' => UnknownPrefix, + _ => InvalidIdent, + } + } + fn number(&mut self, first_digit: char) -> LiteralKind { debug_assert!('0' <= self.prev() && self.prev() <= '9'); let mut base = Base::Decimal; diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index cf35c3cd53b2f..1a620968d56a8 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -1,3 +1,4 @@ +use crate::lexer::unicode_chars::UNICODE_ARRAY; use rustc_ast::ast::{self, AttrStyle}; use rustc_ast::token::{self, CommentKind, Token, TokenKind}; use rustc_ast::tokenstream::{Spacing, TokenStream}; @@ -222,6 +223,22 @@ impl<'a> StringReader<'a> { } token::Ident(sym, is_raw_ident) } + rustc_lexer::TokenKind::InvalidIdent + // Do not recover an identifier with emoji if the codepoint is a confusable + // with a recoverable substitution token, like `โž–`. + if UNICODE_ARRAY + .iter() + .find(|&&(c, _, _)| { + let sym = self.str_from(start); + sym.chars().count() == 1 && c == sym.chars().next().unwrap() + }) + .is_none() => + { + let sym = nfc_normalize(self.str_from(start)); + let span = self.mk_sp(start, self.pos); + self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span); + token::Ident(sym, false) + } rustc_lexer::TokenKind::Literal { kind, suffix_start } => { let suffix_start = start + BytePos(suffix_start as u32); let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind); @@ -293,7 +310,7 @@ impl<'a> StringReader<'a> { rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret), rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent), - rustc_lexer::TokenKind::Unknown => { + rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => { let c = self.str_from(start).chars().next().unwrap(); let mut err = self.struct_fatal_span_char(start, self.pos, "unknown start of token", c); diff --git a/compiler/rustc_parse/src/lexer/unicode_chars.rs b/compiler/rustc_parse/src/lexer/unicode_chars.rs index 3eebc088f3fb7..ccd11f06bc582 100644 --- a/compiler/rustc_parse/src/lexer/unicode_chars.rs +++ b/compiler/rustc_parse/src/lexer/unicode_chars.rs @@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder}; use rustc_span::{symbol::kw, BytePos, Pos, Span}; #[rustfmt::skip] // for line breaks -const UNICODE_ARRAY: &[(char, &str, char)] = &[ +pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[ ('โ€จ', "Line Separator", ' '), ('โ€ฉ', "Paragraph Separator", ' '), ('แš€', "Ogham Space mark", ' '), diff --git a/compiler/rustc_session/src/parse.rs b/compiler/rustc_session/src/parse.rs index f7246641dca3e..d5b520325e550 100644 --- a/compiler/rustc_session/src/parse.rs +++ b/compiler/rustc_session/src/parse.rs @@ -119,8 +119,13 @@ pub struct ParseSess { pub config: CrateConfig, pub edition: Edition, pub missing_fragment_specifiers: Lock>, - /// Places where raw identifiers were used. This is used for feature-gating raw identifiers. + /// Places where raw identifiers were used. This is used to avoid complaining about idents + /// clashing with keywords in new editions. pub raw_identifier_spans: Lock>, + /// Places where identifiers that contain invalid Unicode codepoints but that look like they + /// should be. Useful to avoid bad tokenization when encountering emoji. We group them to + /// provide a single error per unique incorrect identifier. + pub bad_unicode_identifiers: Lock>>, source_map: Lrc, pub buffered_lints: Lock>, /// Contains the spans of block expressions that could have been incomplete based on the @@ -160,6 +165,7 @@ impl ParseSess { edition: ExpnId::root().expn_data().edition, missing_fragment_specifiers: Default::default(), raw_identifier_spans: Lock::new(Vec::new()), + bad_unicode_identifiers: Lock::new(Default::default()), source_map, buffered_lints: Lock::new(vec![]), ambiguous_block_expr_parse: Lock::new(FxHashMap::default()), diff --git a/src/librustdoc/html/highlight.rs b/src/librustdoc/html/highlight.rs index e177a11303643..688860f94e183 100644 --- a/src/librustdoc/html/highlight.rs +++ b/src/librustdoc/html/highlight.rs @@ -632,7 +632,7 @@ impl<'a> Classifier<'a> { }, Some(c) => c, }, - TokenKind::RawIdent | TokenKind::UnknownPrefix => { + TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => { Class::Ident(self.new_span(before, text)) } TokenKind::Lifetime { .. } => Class::Lifetime, diff --git a/src/test/ui/parser/emoji-identifiers.rs b/src/test/ui/parser/emoji-identifiers.rs new file mode 100644 index 0000000000000..ef18939bbb80c --- /dev/null +++ b/src/test/ui/parser/emoji-identifiers.rs @@ -0,0 +1,16 @@ +struct ABig๐Ÿ‘ฉโ€๐Ÿ‘ฉโ€๐Ÿ‘งโ€๐Ÿ‘งFamily; //~ ERROR identifiers cannot contain emoji +struct ๐Ÿ‘€; //~ ERROR identifiers cannot contain emoji +impl ๐Ÿ‘€ { + fn full_of_โœจ() -> ๐Ÿ‘€ { //~ ERROR identifiers cannot contain emoji + ๐Ÿ‘€ + } +} +fn i_like_to_๐Ÿ˜…_a_lot() -> ๐Ÿ‘€ { //~ ERROR identifiers cannot contain emoji + ๐Ÿ‘€::full_ofโœจ() //~ ERROR no function or associated item named `full_ofโœจ` found for struct `๐Ÿ‘€` + //~^ ERROR identifiers cannot contain emoji +} +fn main() { + let _ = i_like_to_๐Ÿ˜„_a_lot() โž– 4; //~ ERROR cannot find function `i_like_to_๐Ÿ˜„_a_lot` in this scope + //~^ ERROR identifiers cannot contain emoji + //~| ERROR unknown start of token: \u{2796} +} diff --git a/src/test/ui/parser/emoji-identifiers.stderr b/src/test/ui/parser/emoji-identifiers.stderr new file mode 100644 index 0000000000000..5f9263c4c13e7 --- /dev/null +++ b/src/test/ui/parser/emoji-identifiers.stderr @@ -0,0 +1,83 @@ +error: unknown start of token: \u{2796} + --> $DIR/emoji-identifiers.rs:13:33 + | +LL | let _ = i_like_to_๐Ÿ˜„_a_lot() โž– 4; + | ^^ + | +help: Unicode character 'โž–' (Heavy Minus Sign) looks like '-' (Minus/Hyphen), but it is not + | +LL | let _ = i_like_to_๐Ÿ˜„_a_lot() - 4; + | ~ + +error[E0425]: cannot find function `i_like_to_๐Ÿ˜„_a_lot` in this scope + --> $DIR/emoji-identifiers.rs:13:13 + | +LL | fn i_like_to_๐Ÿ˜…_a_lot() -> ๐Ÿ‘€ { + | ----------------------------- similarly named function `i_like_to_๐Ÿ˜…_a_lot` defined here +... +LL | let _ = i_like_to_๐Ÿ˜„_a_lot() โž– 4; + | ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_๐Ÿ˜…_a_lot` + +error: identifiers cannot contain emoji: `ABig๐Ÿ‘ฉ๐Ÿ‘ฉ๐Ÿ‘ง๐Ÿ‘งFamily` + --> $DIR/emoji-identifiers.rs:1:8 + | +LL | struct ABig๐Ÿ‘ฉ๐Ÿ‘ฉ๐Ÿ‘ง๐Ÿ‘งFamily; + | ^^^^^^^^^^^^^^^^^^ + +error: identifiers cannot contain emoji: `๐Ÿ‘€` + --> $DIR/emoji-identifiers.rs:2:8 + | +LL | struct ๐Ÿ‘€; + | ^^ +LL | impl ๐Ÿ‘€ { + | ^^ +LL | fn full_of_โœจ() -> ๐Ÿ‘€ { + | ^^ +LL | ๐Ÿ‘€ + | ^^ +... +LL | fn i_like_to_๐Ÿ˜…_a_lot() -> ๐Ÿ‘€ { + | ^^ +LL | ๐Ÿ‘€::full_ofโœจ() + | ^^ + +error: identifiers cannot contain emoji: `full_of_โœจ` + --> $DIR/emoji-identifiers.rs:4:8 + | +LL | fn full_of_โœจ() -> ๐Ÿ‘€ { + | ^^^^^^^^^^ + +error: identifiers cannot contain emoji: `i_like_to_๐Ÿ˜…_a_lot` + --> $DIR/emoji-identifiers.rs:8:4 + | +LL | fn i_like_to_๐Ÿ˜…_a_lot() -> ๐Ÿ‘€ { + | ^^^^^^^^^^^^^^^^^^ + +error: identifiers cannot contain emoji: `full_ofโœจ` + --> $DIR/emoji-identifiers.rs:9:8 + | +LL | ๐Ÿ‘€::full_ofโœจ() + | ^^^^^^^^^ + +error: identifiers cannot contain emoji: `i_like_to_๐Ÿ˜„_a_lot` + --> $DIR/emoji-identifiers.rs:13:13 + | +LL | let _ = i_like_to_๐Ÿ˜„_a_lot() โž– 4; + | ^^^^^^^^^^^^^^^^^^ + +error[E0599]: no function or associated item named `full_ofโœจ` found for struct `๐Ÿ‘€` in the current scope + --> $DIR/emoji-identifiers.rs:9:8 + | +LL | struct ๐Ÿ‘€; + | ---------- function or associated item `full_ofโœจ` not found for this +... +LL | ๐Ÿ‘€::full_ofโœจ() + | ^^^^^^^^^ + | | + | function or associated item not found in `๐Ÿ‘€` + | help: there is an associated function with a similar name: `full_of_โœจ` + +error: aborting due to 9 previous errors + +Some errors have detailed explanations: E0425, E0599. +For more information about an error, try `rustc --explain E0425`. diff --git a/src/tools/cargo b/src/tools/cargo index e1fb17631eb1b..7f08ace4f1305 160000 --- a/src/tools/cargo +++ b/src/tools/cargo @@ -1 +1 @@ -Subproject commit e1fb17631eb1b3665cdbe45b1c186111577ef512 +Subproject commit 7f08ace4f1305de7f3b1b0e2f765911957226bd4 diff --git a/src/tools/tidy/src/deps.rs b/src/tools/tidy/src/deps.rs index c1719a9ffe80c..bc33284f31e2e 100644 --- a/src/tools/tidy/src/deps.rs +++ b/src/tools/tidy/src/deps.rs @@ -82,8 +82,8 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[ "bitflags", "block-buffer", "block-padding", - "byteorder", "byte-tools", + "byteorder", "cc", "cfg-if", "chalk-derive", @@ -140,9 +140,9 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[ "memmap2", "memoffset", "miniz_oxide", - "num_cpus", "num-integer", "num-traits", + "num_cpus", "object", "odht", "once_cell", @@ -190,8 +190,8 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[ "serde_json", "sha-1", "sha2", - "smallvec", "sharded-slab", + "smallvec", "snap", "stable_deref_trait", "stacker", @@ -211,6 +211,11 @@ const PERMITTED_DEPENDENCIES: &[&str] = &[ "tracing-subscriber", "tracing-tree", "typenum", + "unic-char-property", + "unic-char-range", + "unic-common", + "unic-emoji-char", + "unic-ucd-version", "unicode-normalization", "unicode-script", "unicode-security",