Skip to content

Commit 852c032

Browse files
authored
Rollup merge of rust-lang#88781 - estebank:emoji-idents, r=oli-obk
Tokenize emoji as if they were valid identifiers. In the lexer, consider emojis to be valid identifiers and reject them later to avoid knock-down parse errors. Partially address rust-lang#86102.
2 parents ca0c556 + 30f9807 commit 852c032

File tree

12 files changed

+231
-15
lines changed

12 files changed

+231
-15
lines changed

Diff for: Cargo.lock

+42
Original file line numberDiff line numberDiff line change
@@ -3980,6 +3980,7 @@ name = "rustc_lexer"
39803980
version = "0.1.0"
39813981
dependencies = [
39823982
"expect-test",
3983+
"unic-emoji-char",
39833984
"unicode-xid",
39843985
]
39853986

@@ -5443,6 +5444,47 @@ version = "0.1.3"
54435444
source = "registry+https://github.com/rust-lang/crates.io-index"
54445445
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"
54455446

5447+
[[package]]
5448+
name = "unic-char-property"
5449+
version = "0.9.0"
5450+
source = "registry+https://github.com/rust-lang/crates.io-index"
5451+
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
5452+
dependencies = [
5453+
"unic-char-range",
5454+
]
5455+
5456+
[[package]]
5457+
name = "unic-char-range"
5458+
version = "0.9.0"
5459+
source = "registry+https://github.com/rust-lang/crates.io-index"
5460+
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"
5461+
5462+
[[package]]
5463+
name = "unic-common"
5464+
version = "0.9.0"
5465+
source = "registry+https://github.com/rust-lang/crates.io-index"
5466+
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"
5467+
5468+
[[package]]
5469+
name = "unic-emoji-char"
5470+
version = "0.9.0"
5471+
source = "registry+https://github.com/rust-lang/crates.io-index"
5472+
checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
5473+
dependencies = [
5474+
"unic-char-property",
5475+
"unic-char-range",
5476+
"unic-ucd-version",
5477+
]
5478+
5479+
[[package]]
5480+
name = "unic-ucd-version"
5481+
version = "0.9.0"
5482+
source = "registry+https://github.com/rust-lang/crates.io-index"
5483+
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
5484+
dependencies = [
5485+
"unic-common",
5486+
]
5487+
54465488
[[package]]
54475489
name = "unicase"
54485490
version = "2.6.0"

Diff for: compiler/rustc_errors/src/emitter.rs

+16-7
Original file line numberDiff line numberDiff line change
@@ -721,7 +721,7 @@ impl EmitterWriter {
721721
}
722722

723723
let source_string = match file.get_line(line.line_index - 1) {
724-
Some(s) => replace_tabs(&*s),
724+
Some(s) => normalize_whitespace(&*s),
725725
None => return Vec::new(),
726726
};
727727

@@ -1272,7 +1272,7 @@ impl EmitterWriter {
12721272
buffer.append(0, ": ", header_style);
12731273
}
12741274
for &(ref text, _) in msg.iter() {
1275-
buffer.append(0, &replace_tabs(text), header_style);
1275+
buffer.append(0, &normalize_whitespace(text), header_style);
12761276
}
12771277
}
12781278

@@ -1526,7 +1526,7 @@ impl EmitterWriter {
15261526

15271527
self.draw_line(
15281528
&mut buffer,
1529-
&replace_tabs(&unannotated_line),
1529+
&normalize_whitespace(&unannotated_line),
15301530
annotated_file.lines[line_idx + 1].line_index - 1,
15311531
last_buffer_line_num,
15321532
width_offset,
@@ -1648,7 +1648,7 @@ impl EmitterWriter {
16481648
buffer.puts(
16491649
row_num - 1,
16501650
max_line_num_len + 3,
1651-
&replace_tabs(
1651+
&normalize_whitespace(
16521652
&*file_lines
16531653
.file
16541654
.get_line(file_lines.lines[line_pos].line_index)
@@ -1674,7 +1674,7 @@ impl EmitterWriter {
16741674
}
16751675

16761676
// print the suggestion
1677-
buffer.append(row_num, &replace_tabs(line), Style::NoStyle);
1677+
buffer.append(row_num, &normalize_whitespace(line), Style::NoStyle);
16781678

16791679
// Colorize addition/replacements with green.
16801680
for &SubstitutionHighlight { start, end } in highlight_parts {
@@ -2054,8 +2054,17 @@ fn num_decimal_digits(num: usize) -> usize {
20542054
MAX_DIGITS
20552055
}
20562056

2057-
fn replace_tabs(str: &str) -> String {
2058-
str.replace('\t', " ")
2057+
const REPLACEMENTS: &[(char, &str)] = &[
2058+
('\t', " "),
2059+
('\u{200D}', ""), // Replace ZWJ with nothing for consistent terminal output of grapheme clusters.
2060+
];
2061+
2062+
fn normalize_whitespace(str: &str) -> String {
2063+
let mut output = str.to_string();
2064+
for (c, replacement) in REPLACEMENTS {
2065+
output = output.replace(*c, replacement);
2066+
}
2067+
output
20592068
}
20602069

20612070
fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) {

Diff for: compiler/rustc_interface/src/passes.rs

+14-1
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ use rustc_session::output::{filename_for_input, filename_for_metadata};
3535
use rustc_session::search_paths::PathKind;
3636
use rustc_session::Session;
3737
use rustc_span::symbol::{Ident, Symbol};
38-
use rustc_span::FileName;
38+
use rustc_span::{FileName, MultiSpan};
3939
use rustc_trait_selection::traits;
4040
use rustc_typeck as typeck;
4141
use tempfile::Builder as TempFileBuilder;
@@ -445,6 +445,19 @@ pub fn configure_and_expand(
445445
}
446446
});
447447

448+
// Gate identifiers containing invalid Unicode codepoints that were recovered during lexing.
449+
sess.parse_sess.bad_unicode_identifiers.with_lock(|identifiers| {
450+
let mut identifiers: Vec<_> = identifiers.drain().collect();
451+
identifiers.sort_by_key(|&(key, _)| key);
452+
for (ident, mut spans) in identifiers.into_iter() {
453+
spans.sort();
454+
sess.diagnostic().span_err(
455+
MultiSpan::from(spans),
456+
&format!("identifiers cannot contain emoji: `{}`", ident),
457+
);
458+
}
459+
});
460+
448461
Ok(krate)
449462
}
450463

Diff for: compiler/rustc_lexer/Cargo.toml

+1
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ doctest = false
1717
# Note that this crate purposefully does not depend on other rustc crates
1818
[dependencies]
1919
unicode-xid = "0.2.0"
20+
unic-emoji-char = "0.9.0"
2021

2122
[dev-dependencies]
2223
expect-test = "1.0"

Diff for: compiler/rustc_lexer/src/lib.rs

+24
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,8 @@ pub enum TokenKind {
6464
/// "ident" or "continue"
6565
/// At this step keywords are also considered identifiers.
6666
Ident,
67+
/// Like the above, but containing invalid unicode codepoints.
68+
InvalidIdent,
6769
/// "r#ident"
6870
RawIdent,
6971
/// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
@@ -411,6 +413,10 @@ impl Cursor<'_> {
411413
let kind = Str { terminated };
412414
Literal { kind, suffix_start }
413415
}
416+
// Identifier starting with an emoji. Only lexed for graceful error recovery.
417+
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
418+
self.fake_ident_or_unknown_prefix()
419+
}
414420
_ => Unknown,
415421
};
416422
Token::new(token_kind, self.len_consumed())
@@ -492,10 +498,28 @@ impl Cursor<'_> {
492498
// we see a prefix here, it is definitely an unknown prefix.
493499
match self.first() {
494500
'#' | '"' | '\'' => UnknownPrefix,
501+
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
502+
self.fake_ident_or_unknown_prefix()
503+
}
495504
_ => Ident,
496505
}
497506
}
498507

508+
fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
509+
// Start is already eaten, eat the rest of identifier.
510+
self.eat_while(|c| {
511+
unicode_xid::UnicodeXID::is_xid_continue(c)
512+
|| (!c.is_ascii() && unic_emoji_char::is_emoji(c))
513+
|| c == '\u{200d}'
514+
});
515+
// Known prefixes must have been handled earlier. So if
516+
// we see a prefix here, it is definitely an unknown prefix.
517+
match self.first() {
518+
'#' | '"' | '\'' => UnknownPrefix,
519+
_ => InvalidIdent,
520+
}
521+
}
522+
499523
fn number(&mut self, first_digit: char) -> LiteralKind {
500524
debug_assert!('0' <= self.prev() && self.prev() <= '9');
501525
let mut base = Base::Decimal;

Diff for: compiler/rustc_parse/src/lexer/mod.rs

+18-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
use crate::lexer::unicode_chars::UNICODE_ARRAY;
12
use rustc_ast::ast::{self, AttrStyle};
23
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
34
use rustc_ast::tokenstream::{Spacing, TokenStream};
@@ -191,6 +192,22 @@ impl<'a> StringReader<'a> {
191192
}
192193
token::Ident(sym, is_raw_ident)
193194
}
195+
rustc_lexer::TokenKind::InvalidIdent
196+
// Do not recover an identifier with emoji if the codepoint is a confusable
197+
// with a recoverable substitution token, like `➖`.
198+
if UNICODE_ARRAY
199+
.iter()
200+
.find(|&&(c, _, _)| {
201+
let sym = self.str_from(start);
202+
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
203+
})
204+
.is_none() =>
205+
{
206+
let sym = nfc_normalize(self.str_from(start));
207+
let span = self.mk_sp(start, self.pos);
208+
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
209+
token::Ident(sym, false)
210+
}
194211
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
195212
let suffix_start = start + BytePos(suffix_start as u32);
196213
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@@ -262,7 +279,7 @@ impl<'a> StringReader<'a> {
262279
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
263280
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
264281

265-
rustc_lexer::TokenKind::Unknown => {
282+
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
266283
let c = self.str_from(start).chars().next().unwrap();
267284
let mut err =
268285
self.struct_fatal_span_char(start, self.pos, "unknown start of token", c);

Diff for: compiler/rustc_parse/src/lexer/unicode_chars.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder};
77
use rustc_span::{symbol::kw, BytePos, Pos, Span};
88

99
#[rustfmt::skip] // for line breaks
10-
const UNICODE_ARRAY: &[(char, &str, char)] = &[
10+
pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[
1111
('\u{2028}', "Line Separator", ' '),
1212
('\u{2029}', "Paragraph Separator", ' '),
1313
(' ', "Ogham Space mark", ' '),

Diff for: compiler/rustc_session/src/parse.rs

+7-1
Original file line numberDiff line numberDiff line change
@@ -119,8 +119,13 @@ pub struct ParseSess {
119119
pub config: CrateConfig,
120120
pub edition: Edition,
121121
pub missing_fragment_specifiers: Lock<FxHashMap<Span, NodeId>>,
122-
/// Places where raw identifiers were used. This is used for feature-gating raw identifiers.
122+
/// Places where raw identifiers were used. This is used to avoid complaining about idents
123+
/// clashing with keywords in new editions.
123124
pub raw_identifier_spans: Lock<Vec<Span>>,
125+
/// Places where identifiers that contain invalid Unicode codepoints but that look like they
126+
/// should be. Useful to avoid bad tokenization when encountering emoji. We group them to
127+
/// provide a single error per unique incorrect identifier.
128+
pub bad_unicode_identifiers: Lock<FxHashMap<Symbol, Vec<Span>>>,
124129
source_map: Lrc<SourceMap>,
125130
pub buffered_lints: Lock<Vec<BufferedEarlyLint>>,
126131
/// Contains the spans of block expressions that could have been incomplete based on the
@@ -160,6 +165,7 @@ impl ParseSess {
160165
edition: ExpnId::root().expn_data().edition,
161166
missing_fragment_specifiers: Default::default(),
162167
raw_identifier_spans: Lock::new(Vec::new()),
168+
bad_unicode_identifiers: Lock::new(Default::default()),
163169
source_map,
164170
buffered_lints: Lock::new(vec![]),
165171
ambiguous_block_expr_parse: Lock::new(FxHashMap::default()),

Diff for: src/librustdoc/html/highlight.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -489,7 +489,7 @@ impl<'a> Classifier<'a> {
489489
},
490490
Some(c) => c,
491491
},
492-
TokenKind::RawIdent | TokenKind::UnknownPrefix => {
492+
TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
493493
Class::Ident(self.new_span(before, text))
494494
}
495495
TokenKind::Lifetime { .. } => Class::Lifetime,

Diff for: src/test/ui/parser/emoji-identifiers.rs

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
struct ABig👩‍👩‍👧‍👧Family; //~ ERROR identifiers cannot contain emoji
2+
struct 👀; //~ ERROR identifiers cannot contain emoji
3+
impl 👀 {
4+
fn full_of_✨() -> 👀 { //~ ERROR identifiers cannot contain emoji
5+
👀
6+
}
7+
}
8+
fn i_like_to_😅_a_lot() -> 👀 { //~ ERROR identifiers cannot contain emoji
9+
👀::full_of✨() //~ ERROR no function or associated item named `full_of✨` found for struct `👀`
10+
//~^ ERROR identifiers cannot contain emoji
11+
}
12+
fn main() {
13+
let _ = i_like_to_😄_a_lot() ➖ 4; //~ ERROR cannot find function `i_like_to_😄_a_lot` in this scope
14+
//~^ ERROR identifiers cannot contain emoji
15+
//~| ERROR unknown start of token: \u{2796}
16+
}

Diff for: src/test/ui/parser/emoji-identifiers.stderr

+83
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,83 @@
1+
error: unknown start of token: \u{2796}
2+
--> $DIR/emoji-identifiers.rs:13:33
3+
|
4+
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
5+
| ^^
6+
|
7+
help: Unicode character '➖' (Heavy Minus Sign) looks like '-' (Minus/Hyphen), but it is not
8+
|
9+
LL | let _ = i_like_to_😄_a_lot() - 4;
10+
| ~
11+
12+
error[E0425]: cannot find function `i_like_to_😄_a_lot` in this scope
13+
--> $DIR/emoji-identifiers.rs:13:13
14+
|
15+
LL | fn i_like_to_😅_a_lot() -> 👀 {
16+
| ----------------------------- similarly named function `i_like_to_😅_a_lot` defined here
17+
...
18+
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
19+
| ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_😅_a_lot`
20+
21+
error: identifiers cannot contain emoji: `ABig👩👩👧👧Family`
22+
--> $DIR/emoji-identifiers.rs:1:8
23+
|
24+
LL | struct ABig👩👩👧👧Family;
25+
| ^^^^^^^^^^^^^^^^^^
26+
27+
error: identifiers cannot contain emoji: `👀`
28+
--> $DIR/emoji-identifiers.rs:2:8
29+
|
30+
LL | struct 👀;
31+
| ^^
32+
LL | impl 👀 {
33+
| ^^
34+
LL | fn full_of_✨() -> 👀 {
35+
| ^^
36+
LL | 👀
37+
| ^^
38+
...
39+
LL | fn i_like_to_😅_a_lot() -> 👀 {
40+
| ^^
41+
LL | 👀::full_of✨()
42+
| ^^
43+
44+
error: identifiers cannot contain emoji: `full_of_✨`
45+
--> $DIR/emoji-identifiers.rs:4:8
46+
|
47+
LL | fn full_of_✨() -> 👀 {
48+
| ^^^^^^^^^^
49+
50+
error: identifiers cannot contain emoji: `i_like_to_😅_a_lot`
51+
--> $DIR/emoji-identifiers.rs:8:4
52+
|
53+
LL | fn i_like_to_😅_a_lot() -> 👀 {
54+
| ^^^^^^^^^^^^^^^^^^
55+
56+
error: identifiers cannot contain emoji: `full_of✨`
57+
--> $DIR/emoji-identifiers.rs:9:8
58+
|
59+
LL | 👀::full_of✨()
60+
| ^^^^^^^^^
61+
62+
error: identifiers cannot contain emoji: `i_like_to_😄_a_lot`
63+
--> $DIR/emoji-identifiers.rs:13:13
64+
|
65+
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
66+
| ^^^^^^^^^^^^^^^^^^
67+
68+
error[E0599]: no function or associated item named `full_of✨` found for struct `👀` in the current scope
69+
--> $DIR/emoji-identifiers.rs:9:8
70+
|
71+
LL | struct 👀;
72+
| ---------- function or associated item `full_of✨` not found for this
73+
...
74+
LL | 👀::full_of✨()
75+
| ^^^^^^^^^
76+
| |
77+
| function or associated item not found in `👀`
78+
| help: there is an associated function with a similar name: `full_of_✨`
79+
80+
error: aborting due to 9 previous errors
81+
82+
Some errors have detailed explanations: E0425, E0599.
83+
For more information about an error, try `rustc --explain E0425`.

0 commit comments

Comments
(0)