Skip to content

Commit

Permalink
Remove TokenKind::InvalidPrefix.
Browse files Browse the repository at this point in the history
`TokenKind::InvalidPrefix` was added in rust-lang#123752 to handle some cases
involving emoji, but it isn't necessary because it's always treated the same as
`TokenKind::InvalidIdent`. This commit removes it, which makes things a
little simpler.
  • Loading branch information
nnethercote committed Nov 19, 2024
1 parent 2c7c369 commit e9a0c3c
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 21 deletions.
21 changes: 8 additions & 13 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,10 +99,6 @@ pub enum TokenKind {
/// several tokens: `'r` and `#` and `foo`.
RawLifetime,

/// Similar to the above, but *always* an error on every edition. This is used
/// for emoji identifier recovery, as those are not meant to be ever accepted.
InvalidPrefix,

/// Guarded string literal prefix: `#"` or `##`.
///
/// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
Expand Down Expand Up @@ -466,7 +462,7 @@ impl Cursor<'_> {
Literal { kind, suffix_start }
}
// Identifier starting with an emoji. Only lexed for graceful error recovery.
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(),
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
_ => Unknown,
};
let res = Token::new(token_kind, self.pos_within_token());
Expand Down Expand Up @@ -550,23 +546,22 @@ impl Cursor<'_> {
// we see a prefix here, it is definitely an unknown prefix.
match self.first() {
'#' | '"' | '\'' => UnknownPrefix,
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident_or_prefix(),
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
_ => Ident,
}
}

fn invalid_ident_or_prefix(&mut self) -> TokenKind {
fn invalid_ident(&mut self) -> TokenKind {
// Start is already eaten, eat the rest of identifier.
self.eat_while(|c| {
const ZERO_WIDTH_JOINER: char = '\u{200d}';
is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
});
// Known prefixes must have been handled earlier. So if
// we see a prefix here, it is definitely an unknown prefix.
match self.first() {
'#' | '"' | '\'' => InvalidPrefix,
_ => InvalidIdent,
}
// An invalid identifier followed by '#' or '"' or '\'' could be
// interpreted as an invalid literal prefix. We don't bother doing that
// because the treatment of invalid identifiers and invalid prefixes
// would be the same.
InvalidIdent
}

fn c_or_byte_string(
Expand Down
5 changes: 2 additions & 3 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
let ident = Symbol::intern(lifetime_name);
token::Lifetime(ident, IdentIsRaw::No)
}
rustc_lexer::TokenKind::InvalidIdent | rustc_lexer::TokenKind::InvalidPrefix
rustc_lexer::TokenKind::InvalidIdent
// Do not recover an identifier with emoji if the codepoint is a confusable
// with a recoverable substitution token, like `➖`.
if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
Expand Down Expand Up @@ -359,8 +359,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),

rustc_lexer::TokenKind::Unknown
| rustc_lexer::TokenKind::InvalidIdent
| rustc_lexer::TokenKind::InvalidPrefix => {
| rustc_lexer::TokenKind::InvalidIdent => {
// Don't emit diagnostics for sequences of the same invalid token
if swallow_next_invalid > 0 {
swallow_next_invalid -= 1;
Expand Down
7 changes: 3 additions & 4 deletions src/librustdoc/html/highlight.rs
Original file line number Diff line number Diff line change
Expand Up @@ -861,10 +861,9 @@ impl<'src> Classifier<'src> {
},
Some(c) => c,
},
TokenKind::RawIdent
| TokenKind::UnknownPrefix
| TokenKind::InvalidPrefix
| TokenKind::InvalidIdent => Class::Ident(self.new_span(before, text)),
TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
Class::Ident(self.new_span(before, text))
}
TokenKind::Lifetime { .. }
| TokenKind::RawLifetime
| TokenKind::UnknownPrefixLifetime => Class::Lifetime,
Expand Down
2 changes: 1 addition & 1 deletion src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ impl<'a> Converter<'a> {
rustc_lexer::TokenKind::Ident => {
SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)
}
rustc_lexer::TokenKind::InvalidPrefix | rustc_lexer::TokenKind::InvalidIdent => {
rustc_lexer::TokenKind::InvalidIdent => {
err = "Ident contains invalid characters";
IDENT
}
Expand Down

0 comments on commit e9a0c3c

Please sign in to comment.