Skip to content

Commit 23a4366

Browse files
committed
Auto merge of rust-lang#88781 - estebank:emoji-idents, r=oli-obk
Tokenize emoji as if they were valid identifiers. In the lexer, consider emoji to be valid identifiers and reject them later to avoid knock-down parse errors. Partially addresses rust-lang#86102.
2 parents c6eda7d + d929164 commit 23a4366

File tree

13 files changed

+223
-15
lines changed

13 files changed

+223
-15
lines changed

Diff for: Cargo.lock

+42
Original file line numberDiff line numberDiff line change
@@ -4040,6 +4040,7 @@ name = "rustc_lexer"
40404040
version = "0.1.0"
40414041
dependencies = [
40424042
"expect-test",
4043+
"unic-emoji-char",
40434044
"unicode-xid",
40444045
]
40454046

@@ -5510,6 +5511,47 @@ version = "0.1.3"
55105511
source = "registry+https://github.com/rust-lang/crates.io-index"
55115512
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
55125513

5514+
[[package]]
5515+
name = "unic-char-property"
5516+
version = "0.9.0"
5517+
source = "registry+https://github.com/rust-lang/crates.io-index"
5518+
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
5519+
dependencies = [
5520+
"unic-char-range",
5521+
]
5522+
5523+
[[package]]
5524+
name = "unic-char-range"
5525+
version = "0.9.0"
5526+
source = "registry+https://github.com/rust-lang/crates.io-index"
5527+
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
5528+
5529+
[[package]]
5530+
name = "unic-common"
5531+
version = "0.9.0"
5532+
source = "registry+https://github.com/rust-lang/crates.io-index"
5533+
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
5534+
5535+
[[package]]
5536+
name = "unic-emoji-char"
5537+
version = "0.9.0"
5538+
source = "registry+https://github.com/rust-lang/crates.io-index"
5539+
checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
5540+
dependencies = [
5541+
"unic-char-property",
5542+
"unic-char-range",
5543+
"unic-ucd-version",
5544+
]
5545+
5546+
[[package]]
5547+
name = "unic-ucd-version"
5548+
version = "0.9.0"
5549+
source = "registry+https://github.com/rust-lang/crates.io-index"
5550+
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
5551+
dependencies = [
5552+
"unic-common",
5553+
]
5554+
55135555
[[package]]
55145556
name = "unicase"
55155557
version = "2.6.0"

Diff for: compiler/rustc_errors/src/emitter.rs

+7-6
Original file line numberDiff line numberDiff line change
@@ -730,7 +730,7 @@ impl EmitterWriter {
730730
}
731731

732732
let source_string = match file.get_line(line.line_index - 1) {
733-
Some(s) => replace_tabs(&*s),
733+
Some(s) => normalize_whitespace(&*s),
734734
None => return Vec::new(),
735735
};
736736

@@ -1286,7 +1286,7 @@ impl EmitterWriter {
12861286
}
12871287
for &(ref text, _) in msg.iter() {
12881288
// Account for newlines to align output to its label.
1289-
for (line, text) in replace_tabs(text).lines().enumerate() {
1289+
for (line, text) in normalize_whitespace(text).lines().enumerate() {
12901290
buffer.append(
12911291
0 + line,
12921292
&format!(
@@ -1550,7 +1550,7 @@ impl EmitterWriter {
15501550

15511551
self.draw_line(
15521552
&mut buffer,
1553-
&replace_tabs(&unannotated_line),
1553+
&normalize_whitespace(&unannotated_line),
15541554
annotated_file.lines[line_idx + 1].line_index - 1,
15551555
last_buffer_line_num,
15561556
width_offset,
@@ -1672,7 +1672,7 @@ impl EmitterWriter {
16721672
buffer.puts(
16731673
row_num - 1,
16741674
max_line_num_len + 3,
1675-
&replace_tabs(
1675+
&normalize_whitespace(
16761676
&*file_lines
16771677
.file
16781678
.get_line(file_lines.lines[line_pos].line_index)
@@ -1698,7 +1698,7 @@ impl EmitterWriter {
16981698
}
16991699

17001700
// print the suggestion
1701-
buffer.append(row_num, &replace_tabs(line), Style::NoStyle);
1701+
buffer.append(row_num, &normalize_whitespace(line), Style::NoStyle);
17021702

17031703
// Colorize addition/replacements with green.
17041704
for &SubstitutionHighlight { start, end } in highlight_parts {
@@ -2081,6 +2081,7 @@ fn num_decimal_digits(num: usize) -> usize {
20812081
// We replace some characters so the CLI output is always consistent and underlines aligned.
20822082
const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
20832083
('\t', " "), // We do our own tab replacement
2084+
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
20842085
('\u{202A}', ""), // The following unicode text flow control characters are inconsistently
20852086
('\u{202B}', ""), // supported across CLIs and can cause confusion due to the bytes on disk
20862087
('\u{202D}', ""), // not corresponding to the visible source code, so we replace them always.
@@ -2092,7 +2093,7 @@ const OUTPUT_REPLACEMENTS: &[(char, &str)] = &[
20922093
('\u{2069}', ""),
20932094
];
20942095

2095-
fn replace_tabs(str: &str) -> String {
2096+
fn normalize_whitespace(str: &str) -> String {
20962097
let mut s = str.to_string();
20972098
for (c, replacement) in OUTPUT_REPLACEMENTS {
20982099
s = s.replace(*c, replacement);

Diff for: compiler/rustc_interface/src/passes.rs

+14-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ use rustc_session::output::{filename_for_input, filename_for_metadata};
3535
use rustc_session::search_paths::PathKind;
3636
use rustc_session::{Limit, Session};
3737
use rustc_span::symbol::{sym, Ident, Symbol};
38-
use rustc_span::FileName;
38+
use rustc_span::{FileName, MultiSpan};
3939
use rustc_trait_selection::traits;
4040
use rustc_typeck as typeck;
4141
use tempfile::Builder as TempFileBuilder;
@@ -450,6 +450,19 @@ pub fn configure_and_expand(
450450
});
451451
}
452452

453+
// Gate identifiers containing invalid Unicode codepoints that were recovered during lexing.
454+
sess.parse_sess.bad_unicode_identifiers.with_lock(|identifiers| {
455+
let mut identifiers: Vec<_> = identifiers.drain().collect();
456+
identifiers.sort_by_key(|&(key, _)| key);
457+
for (ident, mut spans) in identifiers.into_iter() {
458+
spans.sort();
459+
sess.diagnostic().span_err(
460+
MultiSpan::from(spans),
461+
&format!("identifiers cannot contain emoji: `{}`", ident),
462+
);
463+
}
464+
});
465+
453466
Ok(krate)
454467
}
455468

Diff for: compiler/rustc_lexer/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ doctest = false
1717
# Note that this crate purposefully does not depend on other rustc crates
1818
[dependencies]
1919
unicode-xid = "0.2.0"
20+
unic-emoji-char = "0.9.0"
2021

2122
[dev-dependencies]
2223
expect-test = "1.0"

Diff for: compiler/rustc_lexer/src/lib.rs

+24
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ pub enum TokenKind {
6464
/// "ident" or "continue"
6565
/// At this step keywords are also considered identifiers.
6666
Ident,
67+
/// Like the above, but containing invalid unicode codepoints.
68+
InvalidIdent,
6769
/// "r#ident"
6870
RawIdent,
6971
/// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
@@ -411,6 +413,10 @@ impl Cursor<'_> {
411413
let kind = Str { terminated };
412414
Literal { kind, suffix_start }
413415
}
416+
// Identifier starting with an emoji. Only lexed for graceful error recovery.
417+
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
418+
self.fake_ident_or_unknown_prefix()
419+
}
414420
_ => Unknown,
415421
};
416422
Token::new(token_kind, self.len_consumed())
@@ -492,10 +498,28 @@ impl Cursor<'_> {
492498
// we see a prefix here, it is definitely an unknown prefix.
493499
match self.first() {
494500
'#' | '"' | '\'' => UnknownPrefix,
501+
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
502+
self.fake_ident_or_unknown_prefix()
503+
}
495504
_ => Ident,
496505
}
497506
}
498507

508+
fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
509+
// Start is already eaten, eat the rest of identifier.
510+
self.eat_while(|c| {
511+
unicode_xid::UnicodeXID::is_xid_continue(c)
512+
|| (!c.is_ascii() && unic_emoji_char::is_emoji(c))
513+
|| c == '\u{200d}'
514+
});
515+
// Known prefixes must have been handled earlier. So if
516+
// we see a prefix here, it is definitely an unknown prefix.
517+
match self.first() {
518+
'#' | '"' | '\'' => UnknownPrefix,
519+
_ => InvalidIdent,
520+
}
521+
}
522+
499523
fn number(&mut self, first_digit: char) -> LiteralKind {
500524
debug_assert!('0' <= self.prev() && self.prev() <= '9');
501525
let mut base = Base::Decimal;

Diff for: compiler/rustc_parse/src/lexer/mod.rs

+18-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use crate::lexer::unicode_chars::UNICODE_ARRAY;
12
use rustc_ast::ast::{self, AttrStyle};
23
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
34
use rustc_ast::tokenstream::{Spacing, TokenStream};
@@ -222,6 +223,22 @@ impl<'a> StringReader<'a> {
222223
}
223224
token::Ident(sym, is_raw_ident)
224225
}
226+
rustc_lexer::TokenKind::InvalidIdent
227+
// Do not recover an identifier with emoji if the codepoint is a confusable
228+
// with a recoverable substitution token, like `➖`.
229+
if UNICODE_ARRAY
230+
.iter()
231+
.find(|&&(c, _, _)| {
232+
let sym = self.str_from(start);
233+
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
234+
})
235+
.is_none() =>
236+
{
237+
let sym = nfc_normalize(self.str_from(start));
238+
let span = self.mk_sp(start, self.pos);
239+
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
240+
token::Ident(sym, false)
241+
}
225242
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
226243
let suffix_start = start + BytePos(suffix_start as u32);
227244
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@@ -293,7 +310,7 @@ impl<'a> StringReader<'a> {
293310
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
294311
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
295312

296-
rustc_lexer::TokenKind::Unknown => {
313+
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
297314
let c = self.str_from(start).chars().next().unwrap();
298315
let mut err =
299316
self.struct_fatal_span_char(start, self.pos, "unknown start of token", c);

Diff for: compiler/rustc_parse/src/lexer/unicode_chars.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder};
77
use rustc_span::{symbol::kw, BytePos, Pos, Span};
88

99
#[rustfmt::skip] // for line breaks
10-
const UNICODE_ARRAY: &[(char, &str, char)] = &[
10+
pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[
1111
('\u{2028}', "Line Separator", ' '),
1212
('\u{2029}', "Paragraph Separator", ' '),
1313
(' ', "Ogham Space mark", ' '),

Diff for: compiler/rustc_session/src/parse.rs

+7-1
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,13 @@ pub struct ParseSess {
119119
pub config: CrateConfig,
120120
pub edition: Edition,
121121
pub missing_fragment_specifiers: Lock<FxHashMap<Span, NodeId>>,
122-
/// Places where raw identifiers were used. This is used for feature-gating raw identifiers.
122+
/// Places where raw identifiers were used. This is used to avoid complaining about idents
123+
/// clashing with keywords in new editions.
123124
pub raw_identifier_spans: Lock<Vec<Span>>,
125+
/// Places where identifiers that contain invalid Unicode codepoints, but that otherwise look
126+
/// like identifiers, were lexed. Useful to avoid bad tokenization when encountering emoji. We group them to
127+
/// provide a single error per unique incorrect identifier.
128+
pub bad_unicode_identifiers: Lock<FxHashMap<Symbol, Vec<Span>>>,
124129
source_map: Lrc<SourceMap>,
125130
pub buffered_lints: Lock<Vec<BufferedEarlyLint>>,
126131
/// Contains the spans of block expressions that could have been incomplete based on the
@@ -160,6 +165,7 @@ impl ParseSess {
160165
edition: ExpnId::root().expn_data().edition,
161166
missing_fragment_specifiers: Default::default(),
162167
raw_identifier_spans: Lock::new(Vec::new()),
168+
bad_unicode_identifiers: Lock::new(Default::default()),
163169
source_map,
164170
buffered_lints: Lock::new(vec![]),
165171
ambiguous_block_expr_parse: Lock::new(FxHashMap::default()),

Diff for: src/librustdoc/html/highlight.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -632,7 +632,7 @@ impl<'a> Classifier<'a> {
632632
},
633633
Some(c) => c,
634634
},
635-
TokenKind::RawIdent | TokenKind::UnknownPrefix => {
635+
TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
636636
Class::Ident(self.new_span(before, text))
637637
}
638638
TokenKind::Lifetime { .. } => Class::Lifetime,

Diff for: src/test/ui/parser/emoji-identifiers.rs

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
struct ABig👩‍👩‍👧‍👧Family; //~ ERROR identifiers cannot contain emoji
2+
struct 👀; //~ ERROR identifiers cannot contain emoji
3+
impl 👀 {
4+
fn full_of_✨() -> 👀 { //~ ERROR identifiers cannot contain emoji
5+
👀
6+
}
7+
}
8+
fn i_like_to_😅_a_lot() -> 👀 { //~ ERROR identifiers cannot contain emoji
9+
👀::full_of✨() //~ ERROR no function or associated item named `full_of✨` found for struct `👀`
10+
//~^ ERROR identifiers cannot contain emoji
11+
}
12+
fn main() {
13+
let _ = i_like_to_😄_a_lot() ➖ 4; //~ ERROR cannot find function `i_like_to_😄_a_lot` in this scope
14+
//~^ ERROR identifiers cannot contain emoji
15+
//~| ERROR unknown start of token: \u{2796}
16+
}

Diff for: src/test/ui/parser/emoji-identifiers.stderr

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
error: unknown start of token: \u{2796}
2+
--> $DIR/emoji-identifiers.rs:13:33
3+
|
4+
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
5+
| ^^
6+
|
7+
help: Unicode character '➖' (Heavy Minus Sign) looks like '-' (Minus/Hyphen), but it is not
8+
|
9+
LL | let _ = i_like_to_😄_a_lot() - 4;
10+
| ~
11+
12+
error[E0425]: cannot find function `i_like_to_😄_a_lot` in this scope
13+
--> $DIR/emoji-identifiers.rs:13:13
14+
|
15+
LL | fn i_like_to_😅_a_lot() -> 👀 {
16+
| ----------------------------- similarly named function `i_like_to_😅_a_lot` defined here
17+
...
18+
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
19+
| ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_😅_a_lot`
20+
21+
error: identifiers cannot contain emoji: `ABig👩👩👧👧Family`
22+
--> $DIR/emoji-identifiers.rs:1:8
23+
|
24+
LL | struct ABig👩👩👧👧Family;
25+
| ^^^^^^^^^^^^^^^^^^
26+
27+
error: identifiers cannot contain emoji: `👀`
28+
--> $DIR/emoji-identifiers.rs:2:8
29+
|
30+
LL | struct 👀;
31+
| ^^
32+
LL | impl 👀 {
33+
| ^^
34+
LL | fn full_of_✨() -> 👀 {
35+
| ^^
36+
LL | 👀
37+
| ^^
38+
...
39+
LL | fn i_like_to_😅_a_lot() -> 👀 {
40+
| ^^
41+
LL | 👀::full_of✨()
42+
| ^^
43+
44+
error: identifiers cannot contain emoji: `full_of_✨`
45+
--> $DIR/emoji-identifiers.rs:4:8
46+
|
47+
LL | fn full_of_✨() -> 👀 {
48+
| ^^^^^^^^^^
49+
50+
error: identifiers cannot contain emoji: `i_like_to_😅_a_lot`
51+
--> $DIR/emoji-identifiers.rs:8:4
52+
|
53+
LL | fn i_like_to_😅_a_lot() -> 👀 {
54+
| ^^^^^^^^^^^^^^^^^^
55+
56+
error: identifiers cannot contain emoji: `full_of✨`
57+
--> $DIR/emoji-identifiers.rs:9:8
58+
|
59+
LL | 👀::full_of✨()
60+
| ^^^^^^^^^
61+
62+
error: identifiers cannot contain emoji: `i_like_to_😄_a_lot`
63+
--> $DIR/emoji-identifiers.rs:13:13
64+
|
65+
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
66+
| ^^^^^^^^^^^^^^^^^^
67+
68+
error[E0599]: no function or associated item named `full_of✨` found for struct `👀` in the current scope
69+
--> $DIR/emoji-identifiers.rs:9:8
70+
|
71+
LL | struct 👀;
72+
| ---------- function or associated item `full_of✨` not found for this
73+
...
74+
LL | 👀::full_of✨()
75+
| ^^^^^^^^^
76+
| |
77+
| function or associated item not found in `👀`
78+
| help: there is an associated function with a similar name: `full_of_✨`
79+
80+
error: aborting due to 9 previous errors
81+
82+
Some errors have detailed explanations: E0425, E0599.
83+
For more information about an error, try `rustc --explain E0425`.

Diff for: src/tools/cargo

0 commit comments

Comments
 (0)