Remove trivia tokens #76170

Merged: 3 commits, Sep 2, 2020
18 changes: 3 additions & 15 deletions compiler/rustc_ast/src/token.rs
@@ -251,17 +251,6 @@ pub enum TokenKind {
/// similarly to symbols in string literal tokens.
DocComment(CommentKind, ast::AttrStyle, Symbol),

// Junk. These carry no data because we don't really care about the data
// they *would* carry, and don't really want to allocate a new ident for
// them. Instead, users could extract that from the associated span.
/// Whitespace.
Whitespace,
/// A comment.
Comment,
Shebang(Symbol),
/// A completely invalid token which should be skipped.
Unknown(Symbol),

Eof,
}

@@ -331,7 +320,7 @@ impl Token {

/// Some token that will be thrown away later.
pub fn dummy() -> Self {
Token::new(TokenKind::Whitespace, DUMMY_SP)
Token::new(TokenKind::Question, DUMMY_SP)
}

/// Recovers a `Token` from an `Ident`. This creates a raw identifier if necessary.
@@ -360,7 +349,7 @@ impl Token {
pub fn is_op(&self) -> bool {
match self.kind {
OpenDelim(..) | CloseDelim(..) | Literal(..) | DocComment(..) | Ident(..)
| Lifetime(..) | Interpolated(..) | Whitespace | Comment | Shebang(..) | Eof => false,
| Lifetime(..) | Interpolated(..) | Eof => false,
_ => true,
}
}
@@ -676,8 +665,7 @@ impl Token {
Le | EqEq | Ne | Ge | AndAnd | OrOr | Tilde | BinOpEq(..) | At | DotDotDot
| DotDotEq | Comma | Semi | ModSep | RArrow | LArrow | FatArrow | Pound | Dollar
| Question | OpenDelim(..) | CloseDelim(..) | Literal(..) | Ident(..)
| Lifetime(..) | Interpolated(..) | DocComment(..) | Whitespace | Comment
| Shebang(..) | Unknown(..) | Eof => return None,
| Lifetime(..) | Interpolated(..) | DocComment(..) | Eof => return None,
};

Some(Token::new(kind, self.span.to(joint.span)))
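The variants removed above were the only trivia in `TokenKind`: once they are gone, every consumer match loses its trivia arms, and `dummy()` just needs any throwaway kind (hence `Question`). A minimal sketch of the before/after shape, using hypothetical stand-in types rather than the real `rustc_ast` definitions:

```rust
// Sketch only: toy stand-ins for rustc_ast's TokenKind, before and after.
#![allow(dead_code)]

// Before: the lexer emitted trivia, and every consumer had to filter it.
enum TokenKindBefore {
    Ident,
    Whitespace,      // trivia
    Comment,         // trivia
    Shebang(String), // trivia
    Unknown(String), // trivia (unrecognized source characters)
    Eof,
}

// After: trivia never leaves the lexer, so only "real" tokens remain.
enum TokenKindAfter {
    Ident,
    Eof,
}

fn main() {
    // Consumers of TokenKindAfter no longer need arms like these:
    let t = TokenKindBefore::Comment;
    let is_trivia = matches!(
        t,
        TokenKindBefore::Whitespace
            | TokenKindBefore::Comment
            | TokenKindBefore::Shebang(_)
            | TokenKindBefore::Unknown(_)
    );
    assert!(is_trivia);
}
```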
4 changes: 0 additions & 4 deletions compiler/rustc_ast_pretty/src/pprust.rs
@@ -289,10 +289,6 @@ fn token_kind_to_string_ext(tok: &TokenKind, convert_dollar_crate: Option<Span>)
doc_comment_to_string(comment_kind, attr_style, data)
}
token::Eof => "<eof>".to_string(),
token::Whitespace => " ".to_string(),
token::Comment => "/* */".to_string(),
token::Shebang(s) => format!("/* shebang: {}*/", s),
token::Unknown(s) => s.to_string(),

token::Interpolated(ref nt) => nonterminal_to_string(nt),
}
2 changes: 1 addition & 1 deletion compiler/rustc_expand/src/proc_macro_server.rs
@@ -189,7 +189,7 @@ impl FromInternal<(TreeAndJoint, &'_ ParseSess, &'_ mut Vec<Self>)>
}

OpenDelim(..) | CloseDelim(..) => unreachable!(),
Whitespace | Comment | Shebang(..) | Unknown(..) | Eof => unreachable!(),
Eof => unreachable!(),
}
}
}
113 changes: 53 additions & 60 deletions compiler/rustc_parse/src/lexer/mod.rs
@@ -1,5 +1,6 @@
use rustc_ast::ast::AttrStyle;
use rustc_ast::token::{self, CommentKind, Token, TokenKind};
use rustc_ast::tokenstream::IsJoint;
use rustc_data_structures::sync::Lrc;
use rustc_errors::{error_code, Applicability, DiagnosticBuilder, FatalError};
use rustc_lexer::Base;
@@ -65,42 +66,46 @@ impl<'a> StringReader<'a> {
self.override_span.unwrap_or_else(|| Span::with_root_ctxt(lo, hi))
}

/// Returns the next token, including trivia like whitespace or comments.
fn next_token(&mut self) -> Token {
/// Returns the next token, and info about preceding whitespace, if any.
fn next_token(&mut self) -> (IsJoint, Token) {
let mut is_joint = IsJoint::Joint;

// Skip `#!` at the start of the file
let start_src_index = self.src_index(self.pos);
let text: &str = &self.src[start_src_index..self.end_src_index];

if text.is_empty() {
let span = self.mk_sp(self.pos, self.pos);
return Token::new(token::Eof, span);
let is_beginning_of_file = self.pos == self.start_pos;
if is_beginning_of_file {
if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
self.pos = self.pos + BytePos::from_usize(shebang_len);
is_joint = IsJoint::NonJoint;
}
}

{
let is_beginning_of_file = self.pos == self.start_pos;
if is_beginning_of_file {
if let Some(shebang_len) = rustc_lexer::strip_shebang(text) {
let start = self.pos;
self.pos = self.pos + BytePos::from_usize(shebang_len);
// Skip trivial (whitespace & comments) tokens
loop {
let start_src_index = self.src_index(self.pos);
let text: &str = &self.src[start_src_index..self.end_src_index];

let sym = self.symbol_from(start + BytePos::from_usize("#!".len()));
let kind = token::Shebang(sym);

let span = self.mk_sp(start, self.pos);
return Token::new(kind, span);
}
if text.is_empty() {
let span = self.mk_sp(self.pos, self.pos);
return (is_joint, Token::new(token::Eof, span));
}
}

let token = rustc_lexer::first_token(text);
let token = rustc_lexer::first_token(text);

let start = self.pos;
self.pos = self.pos + BytePos::from_usize(token.len);
let start = self.pos;
self.pos = self.pos + BytePos::from_usize(token.len);

debug!("try_next_token: {:?}({:?})", token.kind, self.str_from(start));
debug!("next_token: {:?}({:?})", token.kind, self.str_from(start));

let kind = self.cook_lexer_token(token.kind, start);
let span = self.mk_sp(start, self.pos);
Token::new(kind, span)
match self.cook_lexer_token(token.kind, start) {
Some(kind) => {
let span = self.mk_sp(start, self.pos);
return (is_joint, Token::new(kind, span));
}
None => is_joint = IsJoint::NonJoint,
}
}
}
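The rewritten `next_token` above replaces "return a trivia token" with "consume trivia in a loop and remember only whether there was a gap". A self-contained toy version of the same pattern; `Spacing` and `Tok` here are simplified stand-ins, not rustc's `IsJoint` and `Token`:

```rust
// Toy version of the trivia-skipping loop in next_token above.
#[derive(Debug, Clone, Copy, PartialEq)]
enum Spacing {
    Joint,    // nothing separated this token from the previous one
    NonJoint, // whitespace or a comment preceded it
}

#[derive(Debug, PartialEq)]
enum Tok {
    Ident(String),
    Punct(char),
    Eof,
}

struct Lexer<'a> {
    src: &'a str,
    pos: usize,
}

impl<'a> Lexer<'a> {
    fn next_token(&mut self) -> (Spacing, Tok) {
        let mut spacing = Spacing::Joint;
        loop {
            let rest = &self.src[self.pos..];
            let c = match rest.chars().next() {
                Some(c) => c,
                None => return (spacing, Tok::Eof),
            };
            if c.is_whitespace() {
                // Trivia: consume it, downgrade the spacing flag, keep looping.
                self.pos += c.len_utf8();
                spacing = Spacing::NonJoint;
                continue;
            }
            if c.is_alphabetic() {
                let len = rest
                    .find(|ch: char| !ch.is_alphanumeric())
                    .unwrap_or(rest.len());
                self.pos += len;
                return (spacing, Tok::Ident(rest[..len].to_string()));
            }
            self.pos += c.len_utf8();
            return (spacing, Tok::Punct(c));
        }
    }
}

fn main() {
    let mut lx = Lexer { src: ">> a", pos: 0 };
    assert_eq!(lx.next_token(), (Spacing::Joint, Tok::Punct('>')));
    assert_eq!(lx.next_token(), (Spacing::Joint, Tok::Punct('>')));
    assert_eq!(lx.next_token(), (Spacing::NonJoint, Tok::Ident("a".into())));
    assert_eq!(lx.next_token(), (Spacing::Joint, Tok::Eof));
}
```

The only information trivia leaves behind is the `NonJoint` flag, which is exactly what the token-tree reader further down needs.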

/// Report a fatal lexical error with a given span.
@@ -140,19 +145,16 @@ impl<'a> StringReader<'a> {
/// Turns simple `rustc_lexer::TokenKind` enum into a rich
/// `librustc_ast::TokenKind`. This turns strings into interned
/// symbols and runs additional validation.
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> TokenKind {
match token {
fn cook_lexer_token(&self, token: rustc_lexer::TokenKind, start: BytePos) -> Option<TokenKind> {
Some(match token {
rustc_lexer::TokenKind::LineComment { doc_style } => {
match doc_style {
Some(doc_style) => {
// Opening delimiter of the length 3 is not included into the symbol.
let content_start = start + BytePos(3);
let content = self.str_from(content_start);
// Skip non-doc comments
let doc_style = doc_style?;

self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
}
None => token::Comment,
}
// Opening delimiter of the length 3 is not included into the symbol.
let content_start = start + BytePos(3);
let content = self.str_from(content_start);
self.cook_doc_comment(content_start, content, CommentKind::Line, doc_style)
}
rustc_lexer::TokenKind::BlockComment { doc_style, terminated } => {
if !terminated {
@@ -171,20 +173,18 @@ impl<'a> StringReader<'a> {
.emit();
FatalError.raise();
}
match doc_style {
Some(doc_style) => {
// Opening delimiter of the length 3 and closing delimiter of the length 2
// are not included into the symbol.
let content_start = start + BytePos(3);
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
let content = self.str_from_to(content_start, content_end);

self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
}
None => token::Comment,
}

// Skip non-doc comments
let doc_style = doc_style?;

// Opening delimiter of the length 3 and closing delimiter of the length 2
// are not included into the symbol.
let content_start = start + BytePos(3);
let content_end = self.pos - BytePos(if terminated { 2 } else { 0 });
let content = self.str_from_to(content_start, content_end);
self.cook_doc_comment(content_start, content, CommentKind::Block, doc_style)
}
rustc_lexer::TokenKind::Whitespace => token::Whitespace,
rustc_lexer::TokenKind::Whitespace => return None,
rustc_lexer::TokenKind::Ident | rustc_lexer::TokenKind::RawIdent => {
let is_raw_ident = token == rustc_lexer::TokenKind::RawIdent;
let mut ident_start = start;
@@ -282,12 +282,11 @@ impl<'a> StringReader<'a> {
// this should be inside `rustc_lexer`. However, we should first remove compound
// tokens like `<<` from `rustc_lexer`, and then add fancier error recovery to it,
// as there will be less overall work to do this way.
let token = unicode_chars::check_for_substitution(self, start, c, &mut err)
.unwrap_or_else(|| token::Unknown(self.symbol_from(start)));
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
err.emit();
token
token?
}
}
})
}

fn cook_doc_comment(
@@ -450,12 +449,6 @@ impl<'a> StringReader<'a> {
self.str_from_to(start, self.pos)
}

/// Creates a Symbol from a given offset to the current offset.
fn symbol_from(&self, start: BytePos) -> Symbol {
debug!("taking an ident from {:?} to {:?}", start, self.pos);
Symbol::intern(self.str_from(start))
}

/// As symbol_from, with an explicit endpoint.
fn symbol_from_to(&self, start: BytePos, end: BytePos) -> Symbol {
debug!("taking an ident from {:?} to {:?}", start, end);
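One detail of the new `cook_lexer_token` worth spelling out: the whole body is wrapped in `Some(match ... )`, so a bare `doc_style?` or `return None` anywhere inside means "trivia, lex the next raw token", while every ordinary arm yields a kind without repeating `Some` per arm. A standalone sketch of the idiom with hypothetical token types:

```rust
// Mini cook_lexer_token: `None` means "trivia -- skip it and lex the next
// raw token". All types here are hypothetical.
#[derive(Debug)]
enum Cooked {
    DocComment(String),
    Ident(String),
}

enum Raw {
    LineComment { doc_style: Option<char> }, // Some('/') for `///`, etc.
    Whitespace,
    Ident(String),
}

fn cook(raw: Raw) -> Option<Cooked> {
    Some(match raw {
        Raw::LineComment { doc_style } => {
            // Skip non-doc comments: `?` early-returns None from `cook`.
            let _style = doc_style?;
            Cooked::DocComment("contents".to_string())
        }
        Raw::Whitespace => return None,
        Raw::Ident(name) => Cooked::Ident(name),
    })
}

fn main() {
    assert!(cook(Raw::LineComment { doc_style: None }).is_none());
    assert!(cook(Raw::Whitespace).is_none());
    assert!(matches!(cook(Raw::Ident("x".into())), Some(Cooked::Ident(_))));
}
```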
36 changes: 13 additions & 23 deletions compiler/rustc_parse/src/lexer/tokentrees.rs
@@ -16,7 +16,6 @@ impl<'a> StringReader<'a> {
let mut tt_reader = TokenTreesReader {
string_reader: self,
token: Token::dummy(),
joint_to_prev: Joint,
open_braces: Vec::new(),
unmatched_braces: Vec::new(),
matching_delim_spans: Vec::new(),
@@ -32,7 +31,6 @@
struct TokenTreesReader<'a> {
string_reader: StringReader<'a>,
token: Token,
joint_to_prev: IsJoint,
/// Stack of open delimiters and their spans. Used for error message.
open_braces: Vec<(token::DelimToken, Span)>,
unmatched_braces: Vec<UnmatchedBrace>,
@@ -53,7 +51,7 @@ impl<'a> TokenTreesReader<'a> {
fn parse_all_token_trees(&mut self) -> PResult<'a, TokenStream> {
let mut buf = TokenStreamBuilder::default();

self.real_token();
self.bump();
while self.token != token::Eof {
buf.push(self.parse_token_tree()?);
}
@@ -126,7 +124,7 @@ impl<'a> TokenTreesReader<'a> {

// Parse the open delimiter.
self.open_braces.push((delim, self.token.span));
self.real_token();
self.bump();

// Parse the token trees within the delimiters.
// We stop at any delimiter so we can try to recover if the user
@@ -171,7 +169,7 @@
));
}
// Parse the closing delimiter.
self.real_token();
self.bump();
}
// Incorrect delimiter.
token::CloseDelim(other) => {
@@ -217,7 +215,7 @@
// bar(baz(
// } // Incorrect delimiter but matches the earlier `{`
if !self.open_braces.iter().any(|&(b, _)| b == other) {
self.real_token();
self.bump();
}
}
token::Eof => {
@@ -264,27 +262,19 @@
}
_ => {
let tt = TokenTree::Token(self.token.take());
self.real_token();
let is_joint = self.joint_to_prev == Joint && self.token.is_op();
Ok((tt, if is_joint { Joint } else { NonJoint }))
let mut is_joint = self.bump();
if !self.token.is_op() {
is_joint = NonJoint;
}
Review thread on lines +266 to +268:

matklad (member, PR author): I originally forgot this condition, and that failed some proc-macro tests. This is a bit weird -- I would expect this jointness censoring to happen at the proc_macro_srv layer, and not here.

petrochenkov (contributor, Sep 1, 2020): Yes, this check should be moved to the proc macro server. The lexer should also produce the jointness flag for the "delimited group" token trees (the match branches above) -- two such flags, actually, one for the opening and one for the closing delimiter.

matklad (member, PR author): Should I do this in this PR, or a separate one? I've also rebased #75528 on top of this PR, and it seems to work: matklad@21bf6ce

petrochenkov (contributor):

> Should I do this in this PR, or a separate one?

Whichever is more convenient.
Ok((tt, is_joint))
}
}
}

fn real_token(&mut self) {
self.joint_to_prev = Joint;
loop {
let token = self.string_reader.next_token();
match token.kind {
token::Whitespace | token::Comment | token::Shebang(_) | token::Unknown(_) => {
self.joint_to_prev = NonJoint;
}
_ => {
self.token = token;
return;
}
}
}
fn bump(&mut self) -> IsJoint {
let (joint_to_prev, token) = self.string_reader.next_token();
self.token = token;
joint_to_prev
}
}

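With the trivia-skipping loop gone from `real_token`, the reader's `bump` just forwards the lexer's flag, and the fallthrough arm above censors it: a token counts as joint to its predecessor only if nothing separated them and it is an operator token. A sketch of that rule in isolation, with simplified stand-in types (the real logic lives in `TokenTreesReader`):

```rust
// Sketch of the jointness rule from the fallthrough arm above.
#[derive(Clone, Copy, PartialEq, Debug)]
enum IsJoint {
    Joint,
    NonJoint,
}

// `joint_to_next` is what `bump()` returned for the *following* token;
// `next_is_op` is `self.token.is_op()` after the bump.
fn censored_jointness(joint_to_next: IsJoint, next_is_op: bool) -> IsJoint {
    if joint_to_next == IsJoint::Joint && next_is_op {
        IsJoint::Joint
    } else {
        // Identifiers, literals, etc. are never glued to the previous token;
        // otherwise `a b` and `ab` could become indistinguishable downstream.
        IsJoint::NonJoint
    }
}

fn main() {
    assert_eq!(censored_jointness(IsJoint::Joint, true), IsJoint::Joint);
    assert_eq!(censored_jointness(IsJoint::Joint, false), IsJoint::NonJoint);
    assert_eq!(censored_jointness(IsJoint::NonJoint, true), IsJoint::NonJoint);
}
```

On `>>=`, every pair of adjacent operator tokens stays joint; in `x + y` every pair is `NonJoint` because whitespace intervened.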
2 changes: 1 addition & 1 deletion compiler/rustc_parse/src/lexer/unicode_chars.rs
@@ -303,7 +303,7 @@ const UNICODE_ARRAY: &[(char, &str, char)] = &[
// However, we should first remove compound tokens like `<<` from `rustc_lexer`, and then add
// fancier error recovery to it, as there will be less overall work to do this way.
const ASCII_ARRAY: &[(char, &str, Option<token::TokenKind>)] = &[
(' ', "Space", Some(token::Whitespace)),
(' ', "Space", None),
('_', "Underscore", Some(token::Ident(kw::Underscore, false))),
('-', "Minus/Hyphen", Some(token::BinOp(token::Minus))),
(',', "Comma", Some(token::Comma)),
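The `ASCII_ARRAY` entry for Space changes from `Some(token::Whitespace)` to `None` for the same reason: there is no whitespace token left to recover to, so the lexer's `token?` above simply skips the substituted character as trivia. A toy version of the recovery table, with hypothetical kinds rather than the real one in `unicode_chars.rs`:

```rust
// Toy recovery table in the spirit of ASCII_ARRAY: the ASCII character a
// confusable stands for, its name, and the token to substitute, if any.
#[derive(Debug, Clone, Copy, PartialEq)]
enum TokKind {
    Comma,
    Semi,
}

const ASCII_TABLE: &[(char, &str, Option<TokKind>)] = &[
    (' ', "Space", None), // no whitespace token exists anymore
    (',', "Comma", Some(TokKind::Comma)),
    (';', "Semicolon", Some(TokKind::Semi)),
];

fn recover(ascii: char) -> Option<TokKind> {
    ASCII_TABLE
        .iter()
        .find(|&&(c, _, _)| c == ascii)
        .and_then(|&(_, _, kind)| kind)
}

fn main() {
    assert_eq!(recover(','), Some(TokKind::Comma));
    assert_eq!(recover(' '), None); // caller skips it as trivia
}
```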
7 changes: 0 additions & 7 deletions compiler/rustc_parse/src/lib.rs
@@ -348,9 +348,6 @@ pub fn tokenstream_probably_equal_for_proc_macro(
| token::CloseDelim(DelimToken::NoDelim)
// The pretty printer collapses many semicolons into one.
| token::Semi
// The pretty printer collapses whitespace arbitrarily and can
// introduce whitespace from `NoDelim`.
| token::Whitespace
// The pretty printer can turn `$crate` into `::crate_name`
| token::ModSep = token.kind {
return false;
@@ -506,8 +503,6 @@ fn token_probably_equal_for_proc_macro(first: &Token, other: &Token) -> bool {
| (&Pound, &Pound)
| (&Dollar, &Dollar)
| (&Question, &Question)
| (&Whitespace, &Whitespace)
| (&Comment, &Comment)
| (&Eof, &Eof) => true,

(&BinOp(a), &BinOp(b)) | (&BinOpEq(a), &BinOpEq(b)) => a == b,
@@ -516,8 +511,6 @@ fn token_probably_equal_for_proc_macro(first: &Token, other: &Token) -> bool {

(&DocComment(a1, a2, a3), &DocComment(b1, b2, b3)) => a1 == b1 && a2 == b2 && a3 == b3,

(&Shebang(a), &Shebang(b)) => a == b,

(&Literal(a), &Literal(b)) => a == b,

(&Lifetime(a), &Lifetime(b)) => a == b,
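The deletions in `rustc_parse/src/lib.rs` are pure fallout: with no trivia kinds, `token_probably_equal_for_proc_macro` has no `Whitespace`/`Comment`/`Shebang` arms left to compare. The one trace trivia still leaves for proc macros is spacing, which is what the jointness flag threads through. A small demonstration using the `proc-macro2` crate as a standalone approximation of the compiler-side behavior (assumes `proc-macro2 = "1"` as a dependency):

```rust
// `>>` lexes as a Joint `>` followed by an Alone `>`; `> >` as two Alone
// ones -- the only trace the deleted whitespace token leaves behind.
use std::str::FromStr;

use proc_macro2::{Spacing, TokenStream, TokenTree};

fn spacings(src: &str) -> Vec<Spacing> {
    TokenStream::from_str(src)
        .unwrap()
        .into_iter()
        .filter_map(|tt| match tt {
            TokenTree::Punct(p) => Some(p.spacing()),
            _ => None,
        })
        .collect()
}

fn main() {
    assert_eq!(spacings(">>"), vec![Spacing::Joint, Spacing::Alone]);
    assert_eq!(spacings("> >"), vec![Spacing::Alone, Spacing::Alone]);
}
```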