Skip to content

Remove TokenKind::InvalidPrefix #133201

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Nov 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 49 additions & 57 deletions compiler/rustc_lexer/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,11 +57,10 @@ impl Token {
/// Enum representing common lexeme types.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
// Multi-char tokens:
/// "// comment"
/// A line comment, e.g. `// comment`.
LineComment { doc_style: Option<DocStyle> },

/// `/* block comment */`
/// A block comment, e.g. `/* block comment */`.
///
/// Block comments can be recursive, so a sequence like `/* /* */`
/// will not be considered terminated and will result in a parsing error.
Expand All @@ -70,18 +69,17 @@ pub enum TokenKind {
/// Any whitespace character sequence.
Whitespace,

/// "ident" or "continue"
///
/// At this step, keywords are also considered identifiers.
/// An identifier or keyword, e.g. `ident` or `continue`.
Ident,

/// Like the above, but containing invalid unicode codepoints.
/// An identifier that is invalid because it contains emoji.
InvalidIdent,

/// "r#ident"
/// A raw identifier, e.g. "r#ident".
RawIdent,

/// An unknown prefix, like `foo#`, `foo'`, `foo"`.
/// An unknown literal prefix, like `foo#`, `foo'`, `foo"`. Excludes
/// literal prefixes that contain emoji, which are considered "invalid".
///
/// Note that only the
/// prefix (`foo`) is included in the token, not the separator (which is
Expand All @@ -93,87 +91,83 @@ pub enum TokenKind {

/// An unknown prefix in a lifetime, like `'foo#`.
///
/// Note that like above, only the `'` and prefix are included in the token
/// Like `UnknownPrefix`, only the `'` and prefix are included in the token
/// and not the separator.
UnknownPrefixLifetime,

/// `'r#lt`, which in edition < 2021 is split into several tokens: `'r # lt`.
/// A raw lifetime, e.g. `'r#foo`. In edition < 2021 it will be split into
/// several tokens: `'r` and `#` and `foo`.
RawLifetime,

/// Similar to the above, but *always* an error on every edition. This is used
/// for emoji identifier recovery, as those are not meant to be ever accepted.
InvalidPrefix,

/// Guarded string literal prefix: `#"` or `##`.
///
/// Used for reserving "guarded strings" (RFC 3598) in edition 2024.
/// Split into the component tokens on older editions.
GuardedStrPrefix,

/// Examples: `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
/// Literals, e.g. `12u8`, `1.0e-40`, `b"123"`. Note that `_` is an invalid
/// suffix, but may be present here on string and float literals. Users of
/// this type will need to check for and reject that case.
///
/// See [LiteralKind] for more details.
Literal { kind: LiteralKind, suffix_start: u32 },

/// "'a"
/// A lifetime, e.g. `'a`.
Lifetime { starts_with_number: bool },

// One-char tokens:
/// ";"
/// `;`
Semi,
/// ","
/// `,`
Comma,
/// "."
/// `.`
Dot,
/// "("
/// `(`
OpenParen,
/// ")"
/// `)`
CloseParen,
/// "{"
/// `{`
OpenBrace,
/// "}"
/// `}`
CloseBrace,
/// "["
/// `[`
OpenBracket,
/// "]"
/// `]`
CloseBracket,
/// "@"
/// `@`
At,
/// "#"
/// `#`
Pound,
/// "~"
/// `~`
Tilde,
/// "?"
/// `?`
Question,
/// ":"
/// `:`
Colon,
/// "$"
/// `$`
Dollar,
/// "="
/// `=`
Eq,
/// "!"
/// `!`
Bang,
/// "<"
/// `<`
Lt,
/// ">"
/// `>`
Gt,
/// "-"
/// `-`
Minus,
/// "&"
/// `&`
And,
/// "|"
/// `|`
Or,
/// "+"
/// `+`
Plus,
/// "*"
/// `*`
Star,
/// "/"
/// `/`
Slash,
/// "^"
/// `^`
Caret,
/// "%"
/// `%`
Percent,

/// Unknown token, not expected by the lexer, e.g. "№"
Expand Down Expand Up @@ -468,7 +462,7 @@ impl Cursor<'_> {
Literal { kind, suffix_start }
}
// Identifier starting with an emoji. Only lexed for graceful error recovery.
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
_ => Unknown,
};
let res = Token::new(token_kind, self.pos_within_token());
Expand Down Expand Up @@ -552,24 +546,22 @@ impl Cursor<'_> {
// we see a prefix here, it is definitely an unknown prefix.
match self.first() {
'#' | '"' | '\'' => UnknownPrefix,
c if !c.is_ascii() && c.is_emoji_char() => self.fake_ident_or_unknown_prefix(),
c if !c.is_ascii() && c.is_emoji_char() => self.invalid_ident(),
_ => Ident,
}
}

fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
fn invalid_ident(&mut self) -> TokenKind {
// Start is already eaten, eat the rest of identifier.
self.eat_while(|c| {
unicode_xid::UnicodeXID::is_xid_continue(c)
|| (!c.is_ascii() && c.is_emoji_char())
|| c == '\u{200d}'
const ZERO_WIDTH_JOINER: char = '\u{200d}';
is_id_continue(c) || (!c.is_ascii() && c.is_emoji_char()) || c == ZERO_WIDTH_JOINER
});
// Known prefixes must have been handled earlier. So if
// we see a prefix here, it is definitely an unknown prefix.
match self.first() {
'#' | '"' | '\'' => InvalidPrefix,
_ => InvalidIdent,
}
// An invalid identifier followed by '#' or '"' or '\'' could be
// interpreted as an invalid literal prefix. We don't bother doing that
// because the treatment of invalid identifiers and invalid prefixes
// would be the same.
InvalidIdent
}

fn c_or_byte_string(
Expand Down
5 changes: 2 additions & 3 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -213,7 +213,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
let ident = Symbol::intern(lifetime_name);
token::Lifetime(ident, IdentIsRaw::No)
}
rustc_lexer::TokenKind::InvalidIdent | rustc_lexer::TokenKind::InvalidPrefix
rustc_lexer::TokenKind::InvalidIdent
// Do not recover an identifier with emoji if the codepoint is a confusable
// with a recoverable substitution token, like `➖`.
if !UNICODE_ARRAY.iter().any(|&(c, _, _)| {
Expand Down Expand Up @@ -359,8 +359,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),

rustc_lexer::TokenKind::Unknown
| rustc_lexer::TokenKind::InvalidIdent
| rustc_lexer::TokenKind::InvalidPrefix => {
| rustc_lexer::TokenKind::InvalidIdent => {
// Don't emit diagnostics for sequences of the same invalid token
if swallow_next_invalid > 0 {
swallow_next_invalid -= 1;
Expand Down
7 changes: 3 additions & 4 deletions src/librustdoc/html/highlight.rs
Original file line number Diff line number Diff line change
Expand Up @@ -861,10 +861,9 @@ impl<'src> Classifier<'src> {
},
Some(c) => c,
},
TokenKind::RawIdent
| TokenKind::UnknownPrefix
| TokenKind::InvalidPrefix
| TokenKind::InvalidIdent => Class::Ident(self.new_span(before, text)),
TokenKind::RawIdent | TokenKind::UnknownPrefix | TokenKind::InvalidIdent => {
Class::Ident(self.new_span(before, text))
}
TokenKind::Lifetime { .. }
| TokenKind::RawLifetime
| TokenKind::UnknownPrefixLifetime => Class::Lifetime,
Expand Down
2 changes: 1 addition & 1 deletion src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ impl<'a> Converter<'a> {
rustc_lexer::TokenKind::Ident => {
SyntaxKind::from_keyword(token_text, self.edition).unwrap_or(IDENT)
}
rustc_lexer::TokenKind::InvalidPrefix | rustc_lexer::TokenKind::InvalidIdent => {
rustc_lexer::TokenKind::InvalidIdent => {
err = "Ident contains invalid characters";
IDENT
}
Expand Down
Loading