Skip to content

Even more lexer improvements #102508

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 8 commits into from
Oct 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 8 additions & 11 deletions compiler/rustc_ast/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -345,17 +345,14 @@ impl Token {
}

/// Returns `true` if this token is an operator/punctuation token, i.e. its
/// kind is anything other than a delimiter, literal, doc comment, identifier,
/// lifetime, interpolated token, or `Eof`.
pub fn is_op(&self) -> bool {
// Earlier form: a negated `matches!` over the eight non-operator kinds, so
// every kind not named here was implicitly treated as an operator.
!matches!(
self.kind,
OpenDelim(..)
| CloseDelim(..)
| Literal(..)
| DocComment(..)
| Ident(..)
| Lifetime(..)
| Interpolated(..)
| Eof
)
// Later form: a `match` that names the operator kinds explicitly and maps the
// same eight non-operator kinds to `false`. There is no wildcard arm, so a
// newly added token kind must be classified here before the code compiles.
match self.kind {
Eq | Lt | Le | EqEq | Ne | Ge | Gt | AndAnd | OrOr | Not | Tilde | BinOp(_)
| BinOpEq(_) | At | Dot | DotDot | DotDotDot | DotDotEq | Comma | Semi | Colon
| ModSep | RArrow | LArrow | FatArrow | Pound | Dollar | Question | SingleQuote => true,

OpenDelim(..) | CloseDelim(..) | Literal(..) | DocComment(..) | Ident(..)
| Lifetime(..) | Interpolated(..) | Eof => false,
}
}

pub fn is_like_plus(&self) -> bool {
Expand Down
11 changes: 11 additions & 0 deletions compiler/rustc_ast/src/tokenstream.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,9 +304,20 @@ pub struct AttributesData {
#[derive(Clone, Debug, Default, Encodable, Decodable)]
pub struct TokenStream(pub(crate) Lrc<Vec<TokenTree>>);

/// Similar to `proc_macro::Spacing`, but for tokens.
///
/// Note that all `ast::TokenTree::Token` instances have a `Spacing`, but when
/// we convert to `proc_macro::TokenTree` for proc macros, only `Punct`
/// `TokenTree`s have a `proc_macro::Spacing`.
///
/// "Immediately followed" means that nothing separates the two tokens: as the
/// examples on the variants show, intervening whitespace or a comment makes
/// the first token `Alone`.
#[derive(Clone, Copy, Debug, PartialEq, Encodable, Decodable, HashStable_Generic)]
pub enum Spacing {
/// The token is not immediately followed by an operator token (as
/// determined by `Token::is_op`). E.g. a `+` token is `Alone` in `+ =`,
/// `+/*foo*/=`, `+ident`, and `+()`.
Alone,

/// The token is immediately followed by an operator token (as determined
/// by `Token::is_op`). E.g. a `+` token is `Joint` in `+=` and `++`.
Joint,
}

Expand Down
8 changes: 6 additions & 2 deletions compiler/rustc_expand/src/proc_macro_server.rs
Original file line number Diff line number Diff line change
Expand Up @@ -110,10 +110,14 @@ impl FromInternal<(TokenStream, &mut Rustc<'_, '_>)> for Vec<TokenTree<TokenStre
tokenstream::TokenTree::Token(token, spacing) => (token, spacing == Joint),
};

// Split the operator into one or more `Punct`s, one per character.
// The final one inherits the jointness of the original token. Any
// before that get `joint = true`.
let mut op = |s: &str| {
assert!(s.is_ascii());
trees.extend(s.as_bytes().iter().enumerate().map(|(idx, &ch)| {
TokenTree::Punct(Punct { ch, joint: joint || idx != s.len() - 1, span })
trees.extend(s.bytes().enumerate().map(|(idx, ch)| {
let is_final = idx == s.len() - 1;
TokenTree::Punct(Punct { ch, joint: if is_final { joint } else { true }, span })
}));
};

Expand Down
2 changes: 1 addition & 1 deletion compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ pub(crate) fn parse_token_trees<'a>(
let cursor = Cursor::new(src);
let string_reader =
StringReader { sess, start_pos, pos: start_pos, src, cursor, override_span };
tokentrees::TokenTreesReader::parse_token_trees(string_reader)
tokentrees::TokenTreesReader::parse_all_token_trees(string_reader)
}

struct StringReader<'a> {
Expand Down
102 changes: 38 additions & 64 deletions compiler/rustc_parse/src/lexer/tokentrees.rs
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pub(super) struct TokenTreesReader<'a> {
}

impl<'a> TokenTreesReader<'a> {
pub(super) fn parse_token_trees(
pub(super) fn parse_all_token_trees(
string_reader: StringReader<'a>,
) -> (PResult<'a, TokenStream>, Vec<UnmatchedBrace>) {
let mut tt_reader = TokenTreesReader {
Expand All @@ -40,36 +40,51 @@ impl<'a> TokenTreesReader<'a> {
last_delim_empty_block_spans: FxHashMap::default(),
matching_block_spans: Vec::new(),
};
let res = tt_reader.parse_all_token_trees();
let res = tt_reader.parse_token_trees(/* is_delimited */ false);
(res, tt_reader.unmatched_braces)
}

// Parse a stream of tokens into a list of `TokenTree`s, up to an `Eof`.
fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> {
// Parse a stream of tokens into a list of `TokenTree`s.
fn parse_token_trees(&mut self, is_delimited: bool) -> PResult<'a, TokenStream> {
self.token = self.string_reader.next_token().0;
let mut buf = TokenStreamBuilder::default();
let mut buf = Vec::new();
loop {
match self.token.kind {
token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)),
token::CloseDelim(delim) => return Err(self.close_delim_err(delim)),
token::Eof => return Ok(buf.into_token_stream()),
_ => buf.push(self.parse_token_tree_non_delim_non_eof()),
}
}
}

// Parse a stream of tokens into a list of `TokenTree`s, up to a `CloseDelim`.
fn parse_token_trees_until_close_delim(&mut self) -> TokenStream {
let mut buf = TokenStreamBuilder::default();
loop {
match self.token.kind {
token::OpenDelim(delim) => buf.push(self.parse_token_tree_open_delim(delim)),
token::CloseDelim(..) => return buf.into_token_stream(),
token::CloseDelim(delim) => {
return if is_delimited {
Ok(TokenStream::new(buf))
} else {
Err(self.close_delim_err(delim))
};
}
token::Eof => {
self.eof_err().emit();
return buf.into_token_stream();
if is_delimited {
self.eof_err().emit();
}
return Ok(TokenStream::new(buf));
}
_ => {
// Get the next normal token. This might require getting multiple adjacent
// single-char tokens and joining them together.
let (this_spacing, next_tok) = loop {
let (next_tok, is_next_tok_preceded_by_whitespace) =
self.string_reader.next_token();
if !is_next_tok_preceded_by_whitespace {
if let Some(glued) = self.token.glue(&next_tok) {
self.token = glued;
} else {
let this_spacing =
if next_tok.is_op() { Spacing::Joint } else { Spacing::Alone };
break (this_spacing, next_tok);
}
} else {
break (Spacing::Alone, next_tok);
}
};
let this_tok = std::mem::replace(&mut self.token, next_tok);
buf.push(TokenTree::Token(this_tok, this_spacing));
}
_ => buf.push(self.parse_token_tree_non_delim_non_eof()),
}
}
}
Expand Down Expand Up @@ -113,14 +128,12 @@ impl<'a> TokenTreesReader<'a> {
// The span for beginning of the delimited section
let pre_span = self.token.span;

// Move past the open delimiter.
self.open_braces.push((open_delim, self.token.span));
self.token = self.string_reader.next_token().0;

// Parse the token trees within the delimiters.
// We stop at any delimiter so we can try to recover if the user
// uses an incorrect delimiter.
let tts = self.parse_token_trees_until_close_delim();
let tts = self.parse_token_trees(/* is_delimited */ true).unwrap();

// Expand to cover the entire delimited token tree
let delim_span = DelimSpan::from_pair(pre_span, self.token.span);
Expand Down Expand Up @@ -242,43 +255,4 @@ impl<'a> TokenTreesReader<'a> {
err.span_label(self.token.span, "unexpected closing delimiter");
err
}

/// Turns the current token into a `TokenTree::Token` and advances the reader.
///
/// Fetches the next token from `string_reader`, installs it as `self.token`,
/// and returns the token that was current on entry, paired with the `Spacing`
/// computed from that next token.
#[inline]
fn parse_token_tree_non_delim_non_eof(&mut self) -> TokenTree {
// `this_spacing` for the returned token refers to whether the token is
// immediately followed by another op token. It is determined by the
// next token: its kind and its `preceded_by_whitespace` status.
let (next_tok, is_next_tok_preceded_by_whitespace) = self.string_reader.next_token();
// `Joint` only when the next token is an op token with no whitespace before
// it; any whitespace, or a non-op next token, yields `Alone`.
let this_spacing = if is_next_tok_preceded_by_whitespace || !next_tok.is_op() {
Spacing::Alone
} else {
Spacing::Joint
};
// Swap the lookahead into `self.token` and take ownership of the old
// current token in one step.
let this_tok = std::mem::replace(&mut self.token, next_tok);
TokenTree::Token(this_tok, this_spacing)
}
}

/// Accumulates `TokenTree`s for a `TokenStream`, gluing a pushed token onto
/// the previous one when the previous token is `Joint` and the two can be
/// combined by `Token::glue`.
#[derive(Default)]
struct TokenStreamBuilder {
// Trees collected so far, in push order (after any gluing).
buf: Vec<TokenTree>,
}

impl TokenStreamBuilder {
/// Appends `tree` to the buffer.
///
/// If the last buffered tree is a token with `Spacing::Joint`, `tree` is
/// also a token, and `prev_token.glue(token)` returns a combined token,
/// the last entry is replaced by the glued token, which takes the spacing
/// of the newly pushed token. Otherwise `tree` is appended unchanged.
#[inline(always)]
fn push(&mut self, tree: TokenTree) {
if let Some(TokenTree::Token(prev_token, Spacing::Joint)) = self.buf.last()
&& let TokenTree::Token(token, joint) = &tree
&& let Some(glued) = prev_token.glue(token)
{
// Replace the previous token with the glued one.
self.buf.pop();
self.buf.push(TokenTree::Token(glued, *joint));
} else {
self.buf.push(tree)
}
}

/// Consumes the builder and wraps the collected trees in a `TokenStream`.
fn into_token_stream(self) -> TokenStream {
TokenStream::new(self.buf)
}
}
6 changes: 5 additions & 1 deletion compiler/rustc_parse/src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,10 @@ impl TokenCursor {

fn desugar(&mut self, attr_style: AttrStyle, data: Symbol, span: Span) -> (Token, Spacing) {
// Searches for the occurrences of `"#*` and returns the minimum number of `#`s
// required to wrap the text.
// required to wrap the text. E.g.
// - `abc d` is wrapped as `r"abc d"` (num_of_hashes = 0)
// - `abc "d"` is wrapped as `r#"abc "d""#` (num_of_hashes = 1)
// - `abc "##d##"` is wrapped as `r###"abc "##d##""###` (num_of_hashes = 3)
let mut num_of_hashes = 0;
let mut count = 0;
for ch in data.as_str().chars() {
Expand All @@ -314,6 +317,7 @@ impl TokenCursor {
num_of_hashes = cmp::max(num_of_hashes, count);
}

// `/// foo` becomes `doc = r"foo"`.
let delim_span = DelimSpan::from_single(span);
let body = TokenTree::Delimited(
delim_span,
Expand Down