Skip to content

Commit

Permalink
Don't recover lifetimes/labels containing emojis as character literals
Browse files Browse the repository at this point in the history
Note that at the time of this commit, `unic-emoji-char` seems to have
data tables only up to Emoji 5.0 (Unicode 10.0), but the Unicode
standard is already newer than this.

A newer emoji such as `🥺` will not be recognized as an emoji,
but older emojis such as `🐱` will.
  • Loading branch information
jieyouxu committed Feb 14, 2023
1 parent c3c6d73 commit 380fa26
Show file tree
Hide file tree
Showing 5 changed files with 173 additions and 12 deletions.
2 changes: 2 additions & 0 deletions compiler/rustc_errors/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -471,6 +471,8 @@ pub enum StashKey {
/// When an invalid lifetime e.g. `'2` should be reinterpreted
/// as a char literal in the parser
LifetimeIsChar,
/// When an invalid lifetime e.g. `'🐱` contains emoji.
LifetimeContainsEmoji,
/// Maybe there was a typo where a comma was forgotten before
/// FRU syntax
MaybeFruTypo,
Expand Down
43 changes: 33 additions & 10 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ pub enum TokenKind {
Literal { kind: LiteralKind, suffix_start: u32 },

/// "'a"
Lifetime { starts_with_number: bool },
Lifetime { starts_with_number: bool, contains_emoji: bool },

// One-char tokens:
/// ";"
Expand Down Expand Up @@ -630,7 +630,13 @@ impl Cursor<'_> {
// If the first symbol is valid for identifier, it can be a lifetime.
// Also check if it's a number for a better error reporting (so '0 will
// be reported as invalid lifetime and not as unterminated char literal).
is_id_start(self.first()) || self.first().is_digit(10)
// We also have to account for potential `'🐱` emojis to avoid reporting
// it as an unterminated char literal.
is_id_start(self.first())
|| self.first().is_digit(10)
// FIXME(#108019): `unic-emoji-char` seems to have data tables only up to Emoji 5.0
// (Unicode 10.0), but the Unicode standard is already newer than this.
|| unic_emoji_char::is_emoji(self.first())
};

if !can_be_a_lifetime {
Expand All @@ -643,16 +649,33 @@ impl Cursor<'_> {
return Literal { kind, suffix_start };
}

// Either a lifetime or a character literal with
// length greater than 1.
// Either a lifetime or a character literal.

let starts_with_number = self.first().is_digit(10);
let mut contains_emoji = false;

// Skip the literal contents.
// First symbol can be a number (which isn't a valid identifier start),
// so skip it without any checks.
self.bump();
self.eat_while(is_id_continue);
// FIXME(#108019): `unic-emoji-char` seems to have data tables only up to Emoji 5.0
// (Unicode 10.0), but the Unicode standard is already newer than this.
if unic_emoji_char::is_emoji(self.first()) {
contains_emoji = true;
} else {
// Skip the literal contents.
// First symbol can be a number (which isn't a valid identifier start),
// so skip it without any checks.
self.bump();
}
self.eat_while(|c| {
if is_id_continue(c) {
true
// FIXME(#108019): `unic-emoji-char` seems to have data tables only up to Emoji 5.0
// (Unicode 10.0), but the Unicode standard is already newer than this.
} else if unic_emoji_char::is_emoji(c) {
contains_emoji = true;
true
} else {
false
}
});

// Check if after skipping literal contents we've met a closing
// single quote (which means that user attempted to create a
Expand All @@ -662,7 +685,7 @@ impl Cursor<'_> {
let kind = Char { terminated: true };
Literal { kind, suffix_start: self.pos_within_token() }
} else {
Lifetime { starts_with_number }
Lifetime { starts_with_number, contains_emoji }
}
}

Expand Down
9 changes: 7 additions & 2 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -200,16 +200,21 @@ impl<'a> StringReader<'a> {
};
token::Literal(token::Lit { kind, symbol, suffix })
}
rustc_lexer::TokenKind::Lifetime { starts_with_number } => {
rustc_lexer::TokenKind::Lifetime { starts_with_number, contains_emoji } => {
// Include the leading `'` in the real identifier, for macro
// expansion purposes. See #12512 for the gory details of why
// this is necessary.
let lifetime_name = self.str_from(start);
if starts_with_number {
let span = self.mk_sp(start, self.pos);
let mut diag = self.sess.struct_err("lifetimes cannot start with a number");
let mut diag = self.sess.struct_err("lifetimes or labels cannot start with a number");
diag.set_span(span);
diag.stash(span, StashKey::LifetimeIsChar);
} else if contains_emoji {
let span = self.mk_sp(start, self.pos);
let mut diag = self.sess.struct_err("lifetimes or labels cannot contain emojis");
diag.set_span(span);
diag.stash(span, StashKey::LifetimeContainsEmoji);
}
let ident = Symbol::intern(lifetime_name);
token::Lifetime(ident)
Expand Down
45 changes: 45 additions & 0 deletions tests/ui/lexer/issue-108019-bad-emoji-recovery.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#![allow(unused_labels)]

// FIXME(#108019): outdated Unicode table
// fn foo() {
// '🥺 loop {
// break
// }
// }

fn bar() {
'🐱 loop {
//~^ ERROR labeled expression must be followed by `:`
//~| ERROR lifetimes or labels cannot contain emojis
break
}
}

fn qux() {
'a🐱 loop {
//~^ ERROR labeled expression must be followed by `:`
//~| ERROR lifetimes or labels cannot contain emojis
break
}
}

fn quux() {
'1🐱 loop {
//~^ ERROR labeled expression must be followed by `:`
//~| ERROR lifetimes or labels cannot start with a number
break
}
}

fn x<'🐱>() -> &'🐱 () {
//~^ ERROR lifetimes or labels cannot contain emojis
//~| ERROR lifetimes or labels cannot contain emojis
&()
}

fn y() {
'a🐱: loop {}
//~^ ERROR lifetimes or labels cannot contain emojis
}

fn main() {}
86 changes: 86 additions & 0 deletions tests/ui/lexer/issue-108019-bad-emoji-recovery.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
error: labeled expression must be followed by `:`
--> $DIR/issue-108019-bad-emoji-recovery.rs:11:5
|
LL | '🐱 loop {
| ^--- help: add `:` after the label
| |
| _____the label
| |
LL | |
LL | |
LL | | break
LL | | }
| |_____^
|
= note: labels are used before loops and blocks, allowing e.g., `break 'label` to them

error: labeled expression must be followed by `:`
--> $DIR/issue-108019-bad-emoji-recovery.rs:19:5
|
LL | 'a🐱 loop {
| ^---- help: add `:` after the label
| |
| _____the label
| |
LL | |
LL | |
LL | | break
LL | | }
| |_____^
|
= note: labels are used before loops and blocks, allowing e.g., `break 'label` to them

error: labeled expression must be followed by `:`
--> $DIR/issue-108019-bad-emoji-recovery.rs:27:5
|
LL | '1🐱 loop {
| ^---- help: add `:` after the label
| |
| _____the label
| |
LL | |
LL | |
LL | | break
LL | | }
| |_____^
|
= note: labels are used before loops and blocks, allowing e.g., `break 'label` to them

error: lifetimes or labels cannot contain emojis
--> $DIR/issue-108019-bad-emoji-recovery.rs:11:5
|
LL | '🐱 loop {
| ^^^

error: lifetimes or labels cannot contain emojis
--> $DIR/issue-108019-bad-emoji-recovery.rs:19:5
|
LL | 'a🐱 loop {
| ^^^^

error: lifetimes or labels cannot start with a number
--> $DIR/issue-108019-bad-emoji-recovery.rs:27:5
|
LL | '1🐱 loop {
| ^^^^

error: lifetimes or labels cannot contain emojis
--> $DIR/issue-108019-bad-emoji-recovery.rs:34:6
|
LL | fn x<'🐱>() -> &'🐱 () {
| ^^^

error: lifetimes or labels cannot contain emojis
--> $DIR/issue-108019-bad-emoji-recovery.rs:34:16
|
LL | fn x<'🐱>() -> &'🐱 () {
| ^^^

error: lifetimes or labels cannot contain emojis
--> $DIR/issue-108019-bad-emoji-recovery.rs:41:5
|
LL | 'a🐱: loop {}
| ^^^^

error: aborting due to 9 previous errors

0 comments on commit 380fa26

Please sign in to comment.