Rollup merge of #133070 - nnethercote:lexer-tweaks, r=chenyukang

Lexer tweaks

Some cleanups and small performance improvements.

r? ```@chenyukang```
rust-lang · Nov 26, 2024 · 9d6a11a · 9d6a11a
2 parents 5915190 + 16a39bb
commit 9d6a11a
Show file tree

Hide file tree

Showing 5 changed files with 121 additions and 125 deletions.
diff --git a/compiler/rustc_lexer/src/lib.rs b/compiler/rustc_lexer/src/lib.rs
@@ -566,19 +566,19 @@ impl Cursor<'_> {
 
     fn c_or_byte_string(
         &mut self,
-        mk_kind: impl FnOnce(bool) -> LiteralKind,
-        mk_kind_raw: impl FnOnce(Option<u8>) -> LiteralKind,
+        mk_kind: fn(bool) -> LiteralKind,
+        mk_kind_raw: fn(Option<u8>) -> LiteralKind,
         single_quoted: Option<fn(bool) -> LiteralKind>,
     ) -> TokenKind {
         match (self.first(), self.second(), single_quoted) {
-            ('\'', _, Some(mk_kind)) => {
+            ('\'', _, Some(single_quoted)) => {
                 self.bump();
                 let terminated = self.single_quoted_string();
                 let suffix_start = self.pos_within_token();
                 if terminated {
                     self.eat_literal_suffix();
                 }
-                let kind = mk_kind(terminated);
+                let kind = single_quoted(terminated);
                 Literal { kind, suffix_start }
             }
             ('"', _, _) => {

diff --git a/compiler/rustc_lexer/src/tests.rs b/compiler/rustc_lexer/src/tests.rs
@@ -77,61 +77,51 @@ fn test_too_many_hashes() {
     check_raw_str(&s2, Err(RawStrError::TooManyDelimiters { found: u32::from(max_count) + 1 }));
 }
 
+// https://github.com/rust-lang/rust/issues/70528
 #[test]
 fn test_valid_shebang() {
-    // https://github.com/rust-lang/rust/issues/70528
-    let input = "#!/usr/bin/rustrun\nlet x = 5;";
-    assert_eq!(strip_shebang(input), Some(18));
-}
+    let input = "#!/bin/bash";
+    assert_eq!(strip_shebang(input), Some(input.len()));
 
-#[test]
-fn test_invalid_shebang_valid_rust_syntax() {
-    // https://github.com/rust-lang/rust/issues/70528
-    let input = "#!    [bad_attribute]";
+    let input = "#![attribute]";
     assert_eq!(strip_shebang(input), None);
-}
 
-#[test]
-fn test_shebang_second_line() {
-    // Because shebangs are interpreted by the kernel, they must be on the first line
-    let input = "\n#!/bin/bash";
+    let input = "#!    /bin/bash";
+    assert_eq!(strip_shebang(input), Some(input.len()));
+
+    let input = "#!    [attribute]";
     assert_eq!(strip_shebang(input), None);
-}
 
-#[test]
-fn test_shebang_space() {
-    let input = "#!    /bin/bash";
+    let input = "#! /* blah */  /bin/bash";
     assert_eq!(strip_shebang(input), Some(input.len()));
-}
 
-#[test]
-fn test_shebang_empty_shebang() {
-    let input = "#!    \n[attribute(foo)]";
+    let input = "#! /* blah */  [attribute]";
     assert_eq!(strip_shebang(input), None);
-}
 
-#[test]
-fn test_invalid_shebang_comment() {
-    let input = "#!//bin/ami/a/comment\n[";
-    assert_eq!(strip_shebang(input), None)
-}
+    let input = "#! // blah\n/bin/bash";
+    assert_eq!(strip_shebang(input), Some(10)); // strip up to the newline
 
-#[test]
-fn test_invalid_shebang_another_comment() {
-    let input = "#!/*bin/ami/a/comment*/\n[attribute";
-    assert_eq!(strip_shebang(input), None)
-}
+    let input = "#! // blah\n[attribute]";
+    assert_eq!(strip_shebang(input), None);
 
-#[test]
-fn test_shebang_valid_rust_after() {
-    let input = "#!/*bin/ami/a/comment*/\npub fn main() {}";
-    assert_eq!(strip_shebang(input), Some(23))
-}
+    let input = "#! /* blah\nblah\nblah */  /bin/bash";
+    assert_eq!(strip_shebang(input), Some(10));
 
-#[test]
-fn test_shebang_followed_by_attrib() {
-    let input = "#!/bin/rust-scripts\n#![allow_unused(true)]";
-    assert_eq!(strip_shebang(input), Some(19));
+    let input = "#! /* blah\nblah\nblah */  [attribute]";
+    assert_eq!(strip_shebang(input), None);
+
+    let input = "#!\n/bin/sh";
+    assert_eq!(strip_shebang(input), Some(2));
+
+    let input = "#!\n[attribute]";
+    assert_eq!(strip_shebang(input), None);
+
+    // Because shebangs are interpreted by the kernel, they must be on the first line
+    let input = "\n#!/bin/bash";
+    assert_eq!(strip_shebang(input), None);
+
+    let input = "\n#![attribute]";
+    assert_eq!(strip_shebang(input), None);
 }
 
 fn check_lexing(src: &str, expect: Expect) {

diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
@@ -18,6 +18,7 @@ use rustc_span::symbol::Symbol;
 use rustc_span::{BytePos, Pos, Span};
 use tracing::debug;
 
+use crate::lexer::diagnostics::TokenTreeDiagInfo;
 use crate::lexer::unicode_chars::UNICODE_ARRAY;
 use crate::{errors, make_unclosed_delims_error};
 
@@ -56,7 +57,7 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
     }
 
     let cursor = Cursor::new(src);
-    let string_reader = StringReader {
+    let mut lexer = Lexer {
         psess,
         start_pos,
         pos: start_pos,
@@ -65,34 +66,31 @@ pub(crate) fn lex_token_trees<'psess, 'src>(
         override_span,
         nbsp_is_whitespace: false,
         last_lifetime: None,
+        token: Token::dummy(),
+        diag_info: TokenTreeDiagInfo::default(),
     };
-    let (stream, res, unmatched_delims) =
-        tokentrees::TokenTreesReader::lex_all_token_trees(string_reader);
-    match res {
-        Ok(()) if unmatched_delims.is_empty() => Ok(stream),
-        _ => {
-            // Return error if there are unmatched delimiters or unclosed delimiters.
-            // We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch
-            // because the delimiter mismatch is more likely to be the root cause of error
-
-            let mut buffer = Vec::with_capacity(1);
-            for unmatched in unmatched_delims {
-                if let Some(err) = make_unclosed_delims_error(unmatched, psess) {
-                    buffer.push(err);
-                }
-            }
-            if let Err(errs) = res {
-                // Add unclosing delimiter or diff marker errors
-                for err in errs {
-                    buffer.push(err);
-                }
-            }
-            Err(buffer)
+    let (_open_spacing, stream, res) = lexer.lex_token_trees(/* is_delimited */ false);
+    let unmatched_delims = lexer.diag_info.unmatched_delims;
+
+    if res.is_ok() && unmatched_delims.is_empty() {
+        Ok(stream)
+    } else {
+        // Return error if there are unmatched delimiters or unclosed delimiters.
+        // We emit delimiter mismatch errors first, then emit the unclosing delimiter mismatch
+        // because the delimiter mismatch is more likely to be the root cause of error
+        let mut buffer: Vec<_> = unmatched_delims
+            .into_iter()
+            .filter_map(|unmatched_delim| make_unclosed_delims_error(unmatched_delim, psess))
+            .collect();
+        if let Err(errs) = res {
+            // Add unclosing delimiter or diff marker errors
+            buffer.extend(errs);
         }
+        Err(buffer)
     }
 }
 
-struct StringReader<'psess, 'src> {
+struct Lexer<'psess, 'src> {
     psess: &'psess ParseSess,
     /// Initial position, read-only.
     start_pos: BytePos,
@@ -111,9 +109,14 @@ struct StringReader<'psess, 'src> {
     /// Track the `Span` for the leading `'` of the last lifetime. Used for
     /// diagnostics to detect possible typo where `"` was meant.
     last_lifetime: Option<Span>,
+
+    /// The current token.
+    token: Token,
+
+    diag_info: TokenTreeDiagInfo,
 }
 
-impl<'psess, 'src> StringReader<'psess, 'src> {
+impl<'psess, 'src> Lexer<'psess, 'src> {
     fn dcx(&self) -> DiagCtxtHandle<'psess> {
         self.psess.dcx()
     }
@@ -124,7 +127,7 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
 
     /// Returns the next token, paired with a bool indicating if the token was
     /// preceded by whitespace.
-    fn next_token(&mut self) -> (Token, bool) {
+    fn next_token_from_cursor(&mut self) -> (Token, bool) {
         let mut preceded_by_whitespace = false;
         let mut swallow_next_invalid = 0;
         // Skip trivial (whitespace & comments) tokens
@@ -231,7 +234,8 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                         .push(span);
                     token::Ident(sym, IdentIsRaw::No)
                 }
-                // split up (raw) c string literals to an ident and a string literal when edition < 2021.
+                // split up (raw) c string literals to an ident and a string literal when edition <
+                // 2021.
                 rustc_lexer::TokenKind::Literal {
                     kind: kind @ (LiteralKind::CStr { .. } | LiteralKind::RawCStr { .. }),
                     suffix_start: _,
@@ -252,7 +256,9 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                     let prefix_span = self.mk_sp(start, lit_start);
                     return (Token::new(self.ident(start), prefix_span), preceded_by_whitespace);
                 }
-                rustc_lexer::TokenKind::GuardedStrPrefix => self.maybe_report_guarded_str(start, str_before),
+                rustc_lexer::TokenKind::GuardedStrPrefix => {
+                    self.maybe_report_guarded_str(start, str_before)
+                }
                 rustc_lexer::TokenKind::Literal { kind, suffix_start } => {
                     let suffix_start = start + BytePos(suffix_start);
                     let (kind, symbol) = self.cook_lexer_literal(start, suffix_start, kind);
@@ -296,13 +302,20 @@ impl<'psess, 'src> StringReader<'psess, 'src> {
                     if prefix_span.at_least_rust_2021() {
                         let span = self.mk_sp(start, self.pos);
 
-                        let lifetime_name_without_tick = Symbol::intern(&self.str_from(ident_start));
+                        let lifetime_name_without_tick =
+                            Symbol::intern(&self.str_from(ident_start));
                         if !lifetime_name_without_tick.can_be_raw() {
-                            self.dcx().emit_err(errors::CannotBeRawLifetime { span, ident: lifetime_name_without_tick });
+                            self.dcx().emit_err(
+                                errors::CannotBeRawLifetime {
+                                    span,
+                                    ident: lifetime_name_without_tick
+                                }
+                            );
                         }
 
                         // Put the `'` back onto the lifetime name.
-                        let mut lifetime_name = String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
+                        let mut lifetime_name =
+                            String::with_capacity(lifetime_name_without_tick.as_str().len() + 1);
                         lifetime_name.push('\'');
                         lifetime_name += lifetime_name_without_tick.as_str();
                         let sym = Symbol::intern(&lifetime_name);