Skip to content

Commit a748526

Browse files
committed
[xt but not xc] Delay string literal unescaping.
Currently string literals are unescaped twice.
- Once during lexing in `cook_quoted`/`cook_c_string`/`cook_common`. This one just checks for errors.
- Again in `LitKind::from_token_lit`, which is called when lowering AST to HIR, and also in a few other places during expansion. This one actually constructs the unescaped string. It also has error checking code, but that part of the code is actually dead (and has several bugs) because the check during lexing catches all errors!

This commit removes the error-check-only unescaping during lexing, and fixes up `LitKind::from_token_lit` so it properly does both checking and construction. This is a backwards-compatible language change: some programs now compile that previously did not. For example, it is now possible for macros to consume "invalid" string literals like "\a\b\c".

This is a continuation of a trend of delaying semantic error checking of literals to after expansion:
- rust-lang#102944 did this for some cases for numeric literals.
- The detection of NUL chars in C string literals is already delayed in this way.

Notes about test changes:
- `ignore-block-help.rs`: this requires a parse error for the test to work. The error used was an unescaping error, which is now delayed to after parsing. So the commit changes it to an "unterminated character literal" error which still occurs during parsing.
- `tests/ui/lexer/error-stage.rs`: this shows the newly allowed cases, due to delayed literal unescaping.
- Several tests had unescaping errors combined with unterminated literal errors. The former are now delayed but the latter remain as lexing errors. So the unterminated literal part needed to be split into a separate test file otherwise compilation would end before the other errors were reported.
- issue-62913.rs: The structure and output changed a bit. Issue rust-lang#62913 was about an ICE due to an unterminated string literal, so the new version should be good enough.
- literals-are-validated-before-expansion.rs: this tests exactly the behaviour that has been changed, and so was removed.
- A couple of other tests produce the same errors, just in a different order.
1 parent 6f6d73b commit a748526

37 files changed

+615
-444
lines changed

compiler/rustc_ast/src/util/literal.rs

+194-76
Large diffs are not rendered by default.

compiler/rustc_ast_lowering/src/expr.rs

+7-8
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ use rustc_data_structures::stack::ensure_sufficient_stack;
1414
use rustc_hir as hir;
1515
use rustc_hir::def::{DefKind, Res};
1616
use rustc_middle::span_bug;
17-
use rustc_parse::parser::report_lit_error;
17+
use rustc_parse::parser::token_lit_to_lit_kind_and_report_errs;
1818
use rustc_span::source_map::{respan, Spanned};
1919
use rustc_span::symbol::{kw, sym, Ident, Symbol};
2020
use rustc_span::DUMMY_SP;
@@ -119,13 +119,12 @@ impl<'hir> LoweringContext<'_, 'hir> {
119119
hir::ExprKind::Unary(op, ohs)
120120
}
121121
ExprKind::Lit(token_lit) => {
122-
let lit_kind = match LitKind::from_token_lit(*token_lit) {
123-
Ok(lit_kind) => lit_kind,
124-
Err(err) => {
125-
report_lit_error(&self.tcx.sess.parse_sess, err, *token_lit, e.span);
126-
LitKind::Err
127-
}
128-
};
122+
let lit_kind = token_lit_to_lit_kind_and_report_errs(
123+
&self.tcx.sess.parse_sess,
124+
*token_lit,
125+
e.span,
126+
)
127+
.unwrap_or(LitKind::Err);
129128
let lit = self.arena.alloc(respan(self.lower_span(e.span), lit_kind));
130129
hir::ExprKind::Lit(lit)
131130
}

compiler/rustc_builtin_macros/src/concat.rs

+31-32
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use rustc_ast as ast;
22
use rustc_ast::tokenstream::TokenStream;
33
use rustc_expand::base::{self, DummyResult};
4-
use rustc_parse::parser::report_lit_error;
4+
use rustc_parse::parser::token_lit_to_lit_kind_and_report_errs;
55
use rustc_span::symbol::Symbol;
66

77
use crate::errors;
@@ -19,44 +19,43 @@ pub fn expand_concat(
1919
let mut has_errors = false;
2020
for e in es {
2121
match e.kind {
22-
ast::ExprKind::Lit(token_lit) => match ast::LitKind::from_token_lit(token_lit) {
23-
Ok(ast::LitKind::Str(s, _) | ast::LitKind::Float(s, _)) => {
24-
accumulator.push_str(s.as_str());
25-
}
26-
Ok(ast::LitKind::Char(c)) => {
27-
accumulator.push(c);
28-
}
29-
Ok(ast::LitKind::Int(i, _)) => {
30-
accumulator.push_str(&i.to_string());
31-
}
32-
Ok(ast::LitKind::Bool(b)) => {
33-
accumulator.push_str(&b.to_string());
34-
}
35-
Ok(ast::LitKind::CStr(..)) => {
36-
cx.emit_err(errors::ConcatCStrLit { span: e.span });
37-
has_errors = true;
38-
}
39-
Ok(ast::LitKind::Byte(..) | ast::LitKind::ByteStr(..)) => {
40-
cx.emit_err(errors::ConcatBytestr { span: e.span });
41-
has_errors = true;
42-
}
43-
Ok(ast::LitKind::Err) => {
44-
has_errors = true;
45-
}
46-
Err(err) => {
47-
report_lit_error(&cx.sess.parse_sess, err, token_lit, e.span);
48-
has_errors = true;
22+
ast::ExprKind::Lit(token_lit) => {
23+
match token_lit_to_lit_kind_and_report_errs(&cx.sess.parse_sess, token_lit, e.span)
24+
{
25+
Ok(ast::LitKind::Str(s, _) | ast::LitKind::Float(s, _)) => {
26+
accumulator.push_str(s.as_str());
27+
}
28+
Ok(ast::LitKind::Char(c)) => {
29+
accumulator.push(c);
30+
}
31+
Ok(ast::LitKind::Int(i, _)) => {
32+
accumulator.push_str(&i.to_string());
33+
}
34+
Ok(ast::LitKind::Bool(b)) => {
35+
accumulator.push_str(&b.to_string());
36+
}
37+
Ok(ast::LitKind::CStr(..)) => {
38+
cx.emit_err(errors::ConcatCStrLit { span: e.span });
39+
has_errors = true;
40+
}
41+
Ok(ast::LitKind::Byte(..) | ast::LitKind::ByteStr(..)) => {
42+
cx.emit_err(errors::ConcatBytestr { span: e.span });
43+
has_errors = true;
44+
}
45+
Ok(ast::LitKind::Err) | Err(()) => {
46+
has_errors = true;
47+
}
4948
}
50-
},
49+
}
5150
// We also want to allow negative numeric literals.
5251
ast::ExprKind::Unary(ast::UnOp::Neg, ref expr)
5352
if let ast::ExprKind::Lit(token_lit) = expr.kind =>
5453
{
55-
match ast::LitKind::from_token_lit(token_lit) {
54+
match token_lit_to_lit_kind_and_report_errs(&cx.sess.parse_sess, token_lit, e.span)
55+
{
5656
Ok(ast::LitKind::Int(i, _)) => accumulator.push_str(&format!("-{i}")),
5757
Ok(ast::LitKind::Float(f, _)) => accumulator.push_str(&format!("-{f}")),
58-
Err(err) => {
59-
report_lit_error(&cx.sess.parse_sess, err, token_lit, e.span);
58+
Err(()) => {
6059
has_errors = true;
6160
}
6261
_ => missing_literal.push(e.span),

compiler/rustc_builtin_macros/src/concat_bytes.rs

+3-5
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
use rustc_ast as ast;
22
use rustc_ast::{ptr::P, tokenstream::TokenStream};
33
use rustc_expand::base::{self, DummyResult};
4-
use rustc_parse::parser::report_lit_error;
4+
use rustc_parse::parser::token_lit_to_lit_kind_and_report_errs;
55
use rustc_span::Span;
66

77
use crate::errors;
@@ -17,7 +17,7 @@ fn invalid_type_err(
1717
ConcatBytesInvalid, ConcatBytesInvalidSuggestion, ConcatBytesNonU8, ConcatBytesOob,
1818
};
1919
let snippet = cx.sess.source_map().span_to_snippet(span).ok();
20-
match ast::LitKind::from_token_lit(token_lit) {
20+
match token_lit_to_lit_kind_and_report_errs(&cx.sess.parse_sess, token_lit, span) {
2121
Ok(ast::LitKind::CStr(_, _)) => {
2222
// Avoid ambiguity in handling of terminal `NUL` by refusing to
2323
// concatenate C string literals as bytes.
@@ -60,9 +60,7 @@ fn invalid_type_err(
6060
cx.emit_err(ConcatBytesNonU8 { span });
6161
}
6262
Ok(ast::LitKind::ByteStr(..) | ast::LitKind::Byte(_)) => unreachable!(),
63-
Err(err) => {
64-
report_lit_error(&cx.sess.parse_sess, err, token_lit, span);
65-
}
63+
Err(()) => {}
6664
}
6765
}
6866

compiler/rustc_expand/src/base.rs

+24-20
Original file line numberDiff line numberDiff line change
@@ -1235,26 +1235,30 @@ pub fn expr_to_spanned_string<'a>(
12351235
let expr = cx.expander().fully_expand_fragment(AstFragment::Expr(expr)).make_expr();
12361236

12371237
Err(match expr.kind {
1238-
ast::ExprKind::Lit(token_lit) => match ast::LitKind::from_token_lit(token_lit) {
1239-
Ok(ast::LitKind::Str(s, style)) => return Ok((s, style, expr.span)),
1240-
Ok(ast::LitKind::ByteStr(..)) => {
1241-
let mut err = cx.struct_span_err(expr.span, err_msg);
1242-
let span = expr.span.shrink_to_lo();
1243-
err.span_suggestion(
1244-
span.with_hi(span.lo() + BytePos(1)),
1245-
"consider removing the leading `b`",
1246-
"",
1247-
Applicability::MaybeIncorrect,
1248-
);
1249-
Some((err, true))
1250-
}
1251-
Ok(ast::LitKind::Err) => None,
1252-
Err(err) => {
1253-
parser::report_lit_error(&cx.sess.parse_sess, err, token_lit, expr.span);
1254-
None
1255-
}
1256-
_ => Some((cx.struct_span_err(expr.span, err_msg), false)),
1257-
},
1238+
ast::ExprKind::Lit(token_lit) => {
1239+
let res = match parser::token_lit_to_lit_kind_and_report_errs(
1240+
&cx.sess.parse_sess,
1241+
token_lit,
1242+
expr.span,
1243+
) {
1244+
Ok(ast::LitKind::Str(s, style)) => return Ok((s, style, expr.span)),
1245+
Ok(ast::LitKind::ByteStr(..)) => {
1246+
let mut err = cx.struct_span_err(expr.span, err_msg);
1247+
let span = expr.span.shrink_to_lo();
1248+
err.span_suggestion(
1249+
span.with_hi(span.lo() + BytePos(1)),
1250+
"consider removing the leading `b`",
1251+
"",
1252+
Applicability::MaybeIncorrect,
1253+
);
1254+
Some((err, true))
1255+
}
1256+
Ok(ast::LitKind::Err) => None,
1257+
Err(()) => None,
1258+
_ => Some((cx.struct_span_err(expr.span, err_msg), false)),
1259+
};
1260+
res
1261+
}
12581262
ast::ExprKind::Err => None,
12591263
_ => Some((cx.struct_span_err(expr.span, err_msg), false)),
12601264
})

compiler/rustc_lexer/src/unescape.rs

+1-1
Original file line numberDiff line numberDiff line change
@@ -347,7 +347,7 @@ where
347347
// them in the range computation.
348348
while let Some(c) = chars.next() {
349349
let start = src.len() - chars.as_str().len() - c.len_utf8();
350-
let res = match c {
350+
let res: Result<T, EscapeError> = match c {
351351
'\\' => {
352352
match chars.clone().next() {
353353
Some('\n') => {

compiler/rustc_parse/src/lexer/mod.rs

+12-79
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
use std::ops::Range;
2-
31
use crate::errors;
42
use crate::lexer::unicode_chars::UNICODE_ARRAY;
53
use crate::make_unclosed_delims_error;
@@ -8,7 +6,6 @@ use rustc_ast::token::{self, CommentKind, Delimiter, Token, TokenKind};
86
use rustc_ast::tokenstream::TokenStream;
97
use rustc_ast::util::unicode::contains_text_flow_control_chars;
108
use rustc_errors::{error_code, Applicability, Diagnostic, DiagnosticBuilder, StashKey};
11-
use rustc_lexer::unescape::{self, EscapeError, Mode};
129
use rustc_lexer::{Base, DocStyle, RawStrError};
1310
use rustc_lexer::{Cursor, LiteralKind};
1411
use rustc_session::lint::builtin::{
@@ -21,10 +18,10 @@ use rustc_span::{edition::Edition, BytePos, Pos, Span};
2118

2219
mod diagnostics;
2320
mod tokentrees;
24-
mod unescape_error_reporting;
21+
pub(crate) mod unescape_error_reporting;
2522
mod unicode_chars;
2623

27-
use unescape_error_reporting::{emit_unescape_error, escaped_char};
24+
use unescape_error_reporting::escaped_char;
2825

2926
// This type is used a lot. Make sure it doesn't unintentionally get bigger.
3027
//
@@ -409,7 +406,7 @@ impl<'a> StringReader<'a> {
409406
error_code!(E0762),
410407
)
411408
}
412-
self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' '
409+
self.cook_quoted(token::Char, start, end, 1, 1) // ' '
413410
}
414411
rustc_lexer::LiteralKind::Byte { terminated } => {
415412
if !terminated {
@@ -419,7 +416,7 @@ impl<'a> StringReader<'a> {
419416
error_code!(E0763),
420417
)
421418
}
422-
self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
419+
self.cook_quoted(token::Byte, start, end, 2, 1) // b' '
423420
}
424421
rustc_lexer::LiteralKind::Str { terminated } => {
425422
if !terminated {
@@ -429,7 +426,7 @@ impl<'a> StringReader<'a> {
429426
error_code!(E0765),
430427
)
431428
}
432-
self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " "
429+
self.cook_quoted(token::Str, start, end, 1, 1) // " "
433430
}
434431
rustc_lexer::LiteralKind::ByteStr { terminated } => {
435432
if !terminated {
@@ -439,7 +436,7 @@ impl<'a> StringReader<'a> {
439436
error_code!(E0766),
440437
)
441438
}
442-
self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
439+
self.cook_quoted(token::ByteStr, start, end, 2, 1) // b" "
443440
}
444441
rustc_lexer::LiteralKind::CStr { terminated } => {
445442
if !terminated {
@@ -449,13 +446,13 @@ impl<'a> StringReader<'a> {
449446
error_code!(E0767),
450447
)
451448
}
452-
self.cook_c_string(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
449+
self.cook_quoted(token::CStr, start, end, 2, 1) // c" "
453450
}
454451
rustc_lexer::LiteralKind::RawStr { n_hashes } => {
455452
if let Some(n_hashes) = n_hashes {
456453
let n = u32::from(n_hashes);
457454
let kind = token::StrRaw(n_hashes);
458-
self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
455+
self.cook_quoted(kind, start, end, 2 + n, 1 + n) // r##" "##
459456
} else {
460457
self.report_raw_str_error(start, 1);
461458
}
@@ -464,7 +461,7 @@ impl<'a> StringReader<'a> {
464461
if let Some(n_hashes) = n_hashes {
465462
let n = u32::from(n_hashes);
466463
let kind = token::ByteStrRaw(n_hashes);
467-
self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
464+
self.cook_quoted(kind, start, end, 3 + n, 1 + n) // br##" "##
468465
} else {
469466
self.report_raw_str_error(start, 2);
470467
}
@@ -473,7 +470,7 @@ impl<'a> StringReader<'a> {
473470
if let Some(n_hashes) = n_hashes {
474471
let n = u32::from(n_hashes);
475472
let kind = token::CStrRaw(n_hashes);
476-
self.cook_c_string(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
473+
self.cook_quoted(kind, start, end, 3 + n, 1 + n) // cr##" "##
477474
} else {
478475
self.report_raw_str_error(start, 2);
479476
}
@@ -693,82 +690,18 @@ impl<'a> StringReader<'a> {
693690
self.sess.emit_fatal(errors::TooManyHashes { span: self.mk_sp(start, self.pos), num });
694691
}
695692

696-
fn cook_common(
693+
fn cook_quoted(
697694
&self,
698695
kind: token::LitKind,
699-
mode: Mode,
700696
start: BytePos,
701697
end: BytePos,
702698
prefix_len: u32,
703699
postfix_len: u32,
704-
unescape: fn(&str, Mode, &mut dyn FnMut(Range<usize>, Result<(), EscapeError>)),
705700
) -> (token::LitKind, Symbol) {
706-
let mut has_fatal_err = false;
707701
let content_start = start + BytePos(prefix_len);
708702
let content_end = end - BytePos(postfix_len);
709703
let lit_content = self.str_from_to(content_start, content_end);
710-
unescape(lit_content, mode, &mut |range, result| {
711-
// Here we only check for errors. The actual unescaping is done later.
712-
if let Err(err) = result {
713-
let span_with_quotes = self.mk_sp(start, end);
714-
let (start, end) = (range.start as u32, range.end as u32);
715-
let lo = content_start + BytePos(start);
716-
let hi = lo + BytePos(end - start);
717-
let span = self.mk_sp(lo, hi);
718-
if err.is_fatal() {
719-
has_fatal_err = true;
720-
}
721-
emit_unescape_error(
722-
&self.sess.span_diagnostic,
723-
lit_content,
724-
span_with_quotes,
725-
span,
726-
mode,
727-
range,
728-
err,
729-
);
730-
}
731-
});
732-
733-
// We normally exclude the quotes for the symbol, but for errors we
734-
// include it because it results in clearer error messages.
735-
if !has_fatal_err {
736-
(kind, Symbol::intern(lit_content))
737-
} else {
738-
(token::Err, self.symbol_from_to(start, end))
739-
}
740-
}
741-
742-
fn cook_quoted(
743-
&self,
744-
kind: token::LitKind,
745-
mode: Mode,
746-
start: BytePos,
747-
end: BytePos,
748-
prefix_len: u32,
749-
postfix_len: u32,
750-
) -> (token::LitKind, Symbol) {
751-
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
752-
unescape::unescape_literal(src, mode, &mut |span, result| {
753-
callback(span, result.map(drop))
754-
})
755-
})
756-
}
757-
758-
fn cook_c_string(
759-
&self,
760-
kind: token::LitKind,
761-
mode: Mode,
762-
start: BytePos,
763-
end: BytePos,
764-
prefix_len: u32,
765-
postfix_len: u32,
766-
) -> (token::LitKind, Symbol) {
767-
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
768-
unescape::unescape_c_string(src, mode, &mut |span, result| {
769-
callback(span, result.map(drop))
770-
})
771-
})
704+
(kind, Symbol::intern(lit_content))
772705
}
773706
}
774707

0 commit comments

Comments
 (0)