Skip to content

Commit

Permalink
Rollup merge of rust-lang#88781 - estebank:emoji-idents, r=oli-obk
Browse files Browse the repository at this point in the history
Tokenize emoji as if they were valid identifiers

In the lexer, consider emojis to be valid identifiers and reject
them later to avoid knock down parse errors.

Partially address rust-lang#86102.
  • Loading branch information
GuillaumeGomez authored Sep 13, 2021
2 parents 0d81ac0 + 5979ed5 commit 14861f1
Show file tree
Hide file tree
Showing 12 changed files with 228 additions and 15 deletions.
42 changes: 42 additions & 0 deletions Cargo.lock
Original file line number Diff line number Diff line change
Expand Up @@ -3980,6 +3980,7 @@ name = "rustc_lexer"
version = "0.1.0"
dependencies = [
"expect-test",
"unic-emoji-char",
"unicode-xid",
]

Expand Down Expand Up @@ -5443,6 +5444,47 @@ version = "0.1.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56dee185309b50d1f11bfedef0fe6d036842e3fb77413abef29f8f8d1c5d4c1c"

[[package]]
name = "unic-char-property"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8c57a407d9b6fa02b4795eb81c5b6652060a15a7903ea981f3d723e6c0be221"
dependencies = [
"unic-char-range",
]

[[package]]
name = "unic-char-range"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0398022d5f700414f6b899e10b8348231abf9173fa93144cbc1a43b9793c1fbc"

[[package]]
name = "unic-common"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "80d7ff825a6a654ee85a63e80f92f054f904f21e7d12da4e22f9834a4aaa35bc"

[[package]]
name = "unic-emoji-char"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0b07221e68897210270a38bde4babb655869637af0f69407f96053a34f76494d"
dependencies = [
"unic-char-property",
"unic-char-range",
"unic-ucd-version",
]

[[package]]
name = "unic-ucd-version"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "96bd2f2237fe450fcd0a1d2f5f4e91711124f7857ba2e964247776ebeeb7b0c4"
dependencies = [
"unic-common",
]

[[package]]
name = "unicase"
version = "2.6.0"
Expand Down
23 changes: 16 additions & 7 deletions compiler/rustc_errors/src/emitter.rs
Original file line number Diff line number Diff line change
Expand Up @@ -721,7 +721,7 @@ impl EmitterWriter {
}

let source_string = match file.get_line(line.line_index - 1) {
Some(s) => replace_tabs(&*s),
Some(s) => normalize_whitespace(&*s),
None => return Vec::new(),
};

Expand Down Expand Up @@ -1272,7 +1272,7 @@ impl EmitterWriter {
buffer.append(0, ": ", header_style);
}
for &(ref text, _) in msg.iter() {
buffer.append(0, &replace_tabs(text), header_style);
buffer.append(0, &normalize_whitespace(text), header_style);
}
}

Expand Down Expand Up @@ -1526,7 +1526,7 @@ impl EmitterWriter {

self.draw_line(
&mut buffer,
&replace_tabs(&unannotated_line),
&normalize_whitespace(&unannotated_line),
annotated_file.lines[line_idx + 1].line_index - 1,
last_buffer_line_num,
width_offset,
Expand Down Expand Up @@ -1648,7 +1648,7 @@ impl EmitterWriter {
buffer.puts(
row_num - 1,
max_line_num_len + 3,
&replace_tabs(
&normalize_whitespace(
&*file_lines
.file
.get_line(file_lines.lines[line_pos].line_index)
Expand All @@ -1674,7 +1674,7 @@ impl EmitterWriter {
}

// print the suggestion
buffer.append(row_num, &replace_tabs(line), Style::NoStyle);
buffer.append(row_num, &normalize_whitespace(line), Style::NoStyle);

// Colorize addition/replacements with green.
for &SubstitutionHighlight { start, end } in highlight_parts {
Expand Down Expand Up @@ -2054,8 +2054,17 @@ fn num_decimal_digits(num: usize) -> usize {
MAX_DIGITS
}

fn replace_tabs(str: &str) -> String {
str.replace('\t', " ")
/// Characters that are rewritten before text is put into the output buffer, each paired with
/// the text that takes its place.
const REPLACEMENTS: &[(char, &str)] = &[
    // Tabs are swapped for a fixed replacement string.
    ('\t', " "),
    // ZWJ is dropped so that grapheme clusters are printed consistently across terminals.
    ('\u{200D}', ""),
];

/// Returns a copy of `str` in which every character listed in `REPLACEMENTS` has been
/// substituted by its replacement text.
///
/// The table entries are applied one after another, in table order, each over the whole string.
fn normalize_whitespace(str: &str) -> String {
    REPLACEMENTS
        .iter()
        .fold(str.to_string(), |text, &(from, to)| text.replace(from, to))
}

fn draw_col_separator(buffer: &mut StyledBuffer, line: usize, col: usize) {
Expand Down
12 changes: 11 additions & 1 deletion compiler/rustc_interface/src/passes.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ use rustc_session::output::{filename_for_input, filename_for_metadata};
use rustc_session::search_paths::PathKind;
use rustc_session::Session;
use rustc_span::symbol::{Ident, Symbol};
use rustc_span::FileName;
use rustc_span::{FileName, MultiSpan};
use rustc_trait_selection::traits;
use rustc_typeck as typeck;
use tempfile::Builder as TempFileBuilder;
Expand Down Expand Up @@ -445,6 +445,16 @@ pub fn configure_and_expand(
}
});

// Gate identifiers containing invalid Unicode codepoints that were recovered during lexing.
sess.parse_sess.bad_unicode_identifiers.with_lock(|identifiers| {
for (ident, spans) in identifiers.drain() {
sess.diagnostic().span_err(
MultiSpan::from(spans),
&format!("identifiers cannot contain emoji: `{}`", ident),
);
}
});

Ok(krate)
}

Expand Down
1 change: 1 addition & 0 deletions compiler/rustc_lexer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ doctest = false
# Note that this crate purposefully does not depend on other rustc crates
[dependencies]
unicode-xid = "0.2.0"
unic-emoji-char = "0.9.0"

[dev-dependencies]
expect-test = "1.0"
24 changes: 24 additions & 0 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,8 @@ pub enum TokenKind {
/// "ident" or "continue"
/// At this step keywords are also considered identifiers.
Ident,
/// Like the above, but containing invalid unicode codepoints.
InvalidIdent,
/// "r#ident"
RawIdent,
/// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
Expand Down Expand Up @@ -411,6 +413,10 @@ impl Cursor<'_> {
let kind = Str { terminated };
Literal { kind, suffix_start }
}
// Identifier starting with an emoji. Only lexed for graceful error recovery.
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
self.fake_ident_or_unknown_prefix()
}
_ => Unknown,
};
Token::new(token_kind, self.len_consumed())
Expand Down Expand Up @@ -492,10 +498,28 @@ impl Cursor<'_> {
// we see a prefix here, it is definitely an unknown prefix.
match self.first() {
'#' | '"' | '\'' => UnknownPrefix,
c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
self.fake_ident_or_unknown_prefix()
}
_ => Ident,
}
}

/// Consumes the remainder of an identifier-like token that contains emoji and classifies it.
///
/// Such tokens are only lexed for graceful error recovery. The result is `UnknownPrefix`
/// when the token is immediately followed by `#`, `"` or `'`, and `InvalidIdent` otherwise.
fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
    // The first character has already been eaten by the caller. What may follow is a ZWJ,
    // a non-ASCII emoji, or any XID_Continue character.
    let is_fake_ident_char = |c: char| {
        c == '\u{200d}'
            || (!c.is_ascii() && unic_emoji_char::is_emoji(c))
            || unicode_xid::UnicodeXID::is_xid_continue(c)
    };
    self.eat_while(is_fake_ident_char);

    // Every known prefix has been handled before reaching this point, so a
    // prefix-introducing character here can only belong to an unknown prefix.
    if matches!(self.first(), '#' | '"' | '\'') { UnknownPrefix } else { InvalidIdent }
}

fn number(&mut self, first_digit: char) -> LiteralKind {
debug_assert!('0' <= self.prev() && self.prev() <= '9');
let mut base = Base::Decimal;
Expand Down
19 changes: 18 additions & 1 deletion compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
use crate::lexer::unicode_chars::UNICODE_ARRAY;
use rustc_ast::ast::{self, AttrStyle};
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
use rustc_ast::tokenstream::{Spacing, TokenStream};
Expand Down Expand Up @@ -191,6 +192,22 @@ impl<'a> StringReader<'a> {
}
token::Ident(sym, is_raw_ident)
}
rustc_lexer::TokenKind::InvalidIdent
// Do not recover an identifier with emoji if the codepoint is a confusable
// with a recoverable substitution token, like `➖`.
if UNICODE_ARRAY
.iter()
.find(|&&(c, _, _)| {
let sym = self.str_from(start);
sym.chars().count() == 1 && c == sym.chars().next().unwrap()
})
.is_none() =>
{
let sym = nfc_normalize(self.str_from(start));
let span = self.mk_sp(start, self.pos);
self.sess.bad_unicode_identifiers.borrow_mut().entry(sym).or_default().push(span);
token::Ident(sym, false)
}
rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
let suffix_start = start + BytePos(suffix_start as u32);
let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
Expand Down Expand Up @@ -262,7 +279,7 @@ impl<'a> StringReader<'a> {
rustc_lexer::TokenKind::Caret => token::BinOp(token::Caret),
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),

rustc_lexer::TokenKind::Unknown => {
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
let c = self.str_from(start).chars().next().unwrap();
let mut err =
self.struct_fatal_span_char(start, self.pos, "unknown start of token", c);
Expand Down
2 changes: 1 addition & 1 deletion compiler/rustc_parse/src/lexer/unicode_chars.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use rustc_errors::{Applicability, DiagnosticBuilder};
use rustc_span::{symbol::kw, BytePos, Pos, Span};

#[rustfmt::skip] // for line breaks
const UNICODE_ARRAY: &[(char, &str, char)] = &[
pub(crate) const UNICODE_ARRAY: &[(char, &str, char)] = &[
('
', "Line Separator", ' '),
('
', "Paragraph Separator", ' '),
('αš€', "Ogham Space mark", ' '),
Expand Down
8 changes: 7 additions & 1 deletion compiler/rustc_session/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,8 +119,13 @@ pub struct ParseSess {
pub config: CrateConfig,
pub edition: Edition,
pub missing_fragment_specifiers: Lock<FxHashMap<Span, NodeId>>,
/// Places where raw identifiers were used. This is used for feature-gating raw identifiers.
/// Places where raw identifiers were used. This is used to avoid complaining about idents
/// clashing with keywords in new editions.
pub raw_identifier_spans: Lock<Vec<Span>>,
/// Spans of identifier-like tokens that contain invalid Unicode codepoints (such as emoji)
/// but otherwise look like identifiers. Recording them avoids bad tokenization when
/// encountering emoji. Spans are grouped by symbol so that a single error is emitted per
/// unique incorrect identifier.
pub bad_unicode_identifiers: Lock<FxHashMap<Symbol, Vec<Span>>>,
source_map: Lrc<SourceMap>,
pub buffered_lints: Lock<Vec<BufferedEarlyLint>>,
/// Contains the spans of block expressions that could have been incomplete based on the
Expand Down Expand Up @@ -160,6 +165,7 @@ impl ParseSess {
edition: ExpnId::root().expn_data().edition,
missing_fragment_specifiers: Default::default(),
raw_identifier_spans: Lock::new(Vec::new()),
bad_unicode_identifiers: Lock::new(Default::default()),
source_map,
buffered_lints: Lock::new(vec![]),
ambiguous_block_expr_parse: Lock::new(FxHashMap::default()),
Expand Down
2 changes: 1 addition & 1 deletion src/librustdoc/html/highlight.rs
Original file line number Diff line number Diff line change
Expand Up @@ -489,7 +489,7 @@ impl<'a> Classifier<'a> {
},
Some(c) => c,
},
TokenKind::RawIdent | TokenKind::UnknownPrefix => {
TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
Class::Ident(self.new_span(before, text))
}
TokenKind::Lifetime { .. } => Class::Lifetime,
Expand Down
16 changes: 16 additions & 0 deletions src/test/ui/parser/emoji-identifiers.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
struct ABig👩‍👩‍👧‍👧Family; //~ ERROR identifiers cannot contain emoji
struct 👀; //~ ERROR identifiers cannot contain emoji
impl 👀 {
fn full_of_✨() -> 👀 { //~ ERROR identifiers cannot contain emoji
👀
}
}
fn i_like_to_😅_a_lot() -> 👀 { //~ ERROR identifiers cannot contain emoji
👀::full_of✨() //~ ERROR no function or associated item named `full_of✨` found for struct `👀`
//~^ ERROR identifiers cannot contain emoji
}
fn main() {
let _ = i_like_to_😄_a_lot() ➖ 4; //~ ERROR cannot find function `i_like_to_😄_a_lot` in this scope
//~^ ERROR identifiers cannot contain emoji
//~| ERROR unknown start of token: \u{2796}
}
83 changes: 83 additions & 0 deletions src/test/ui/parser/emoji-identifiers.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
error: unknown start of token: \u{2796}
--> $DIR/emoji-identifiers.rs:13:33
|
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
| ^^
|
help: Unicode character '➖' (Heavy Minus Sign) looks like '-' (Minus/Hyphen), but it is not
|
LL | let _ = i_like_to_😄_a_lot() - 4;
| ~

error[E0425]: cannot find function `i_like_to_😄_a_lot` in this scope
--> $DIR/emoji-identifiers.rs:13:13
|
LL | fn i_like_to_😅_a_lot() -> 👀 {
| ----------------------------- similarly named function `i_like_to_😅_a_lot` defined here
...
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
| ^^^^^^^^^^^^^^^^^^ help: a function with a similar name exists: `i_like_to_😅_a_lot`

error: identifiers cannot contain emoji: `i_like_to_😄_a_lot`
--> $DIR/emoji-identifiers.rs:13:13
|
LL | let _ = i_like_to_😄_a_lot() ➖ 4;
| ^^^^^^^^^^^^^^^^^^

error: identifiers cannot contain emoji: `full_of_✨`
--> $DIR/emoji-identifiers.rs:4:8
|
LL | fn full_of_✨() -> 👀 {
| ^^^^^^^^^^

error: identifiers cannot contain emoji: `full_of✨`
--> $DIR/emoji-identifiers.rs:9:8
|
LL | 👀::full_of✨()
| ^^^^^^^^^

error: identifiers cannot contain emoji: `👀`
--> $DIR/emoji-identifiers.rs:2:8
|
LL | struct 👀;
| ^^
LL | impl 👀 {
| ^^
LL | fn full_of_✨() -> 👀 {
| ^^
LL | 👀
| ^^
...
LL | fn i_like_to_😅_a_lot() -> 👀 {
| ^^
LL | 👀::full_of✨()
| ^^

error: identifiers cannot contain emoji: `i_like_to_😅_a_lot`
--> $DIR/emoji-identifiers.rs:8:4
|
LL | fn i_like_to_😅_a_lot() -> 👀 {
| ^^^^^^^^^^^^^^^^^^

error: identifiers cannot contain emoji: `ABig👩👩👧👧Family`
--> $DIR/emoji-identifiers.rs:1:8
|
LL | struct ABig👩👩👧👧Family;
| ^^^^^^^^^^^^^^^^^^

error[E0599]: no function or associated item named `full_of✨` found for struct `👀` in the current scope
--> $DIR/emoji-identifiers.rs:9:8
|
LL | struct 👀;
| ---------- function or associated item `full_of✨` not found for this
...
LL | 👀::full_of✨()
| ^^^^^^^^^
| |
| function or associated item not found in `👀`
| help: there is an associated function with a similar name: `full_of_✨`

error: aborting due to 9 previous errors

Some errors have detailed explanations: E0425, E0599.
For more information about an error, try `rustc --explain E0425`.
Loading

0 comments on commit 14861f1

Please sign in to comment.