Skip to content

Improve heuristics whether format_args string is a source literal #106195

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 51 additions & 5 deletions compiler/rustc_parse_format/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ pub use Flag::*;
pub use Piece::*;
pub use Position::*;

use rustc_lexer::unescape;
use std::iter;
use std::str;
use std::string;
Expand Down Expand Up @@ -56,6 +57,13 @@ impl InnerWidthMapping {
}
}

/// Whether the input string is a literal. If yes, it contains the inner width mappings.
#[derive(Clone, PartialEq, Eq)]
enum InputStringKind {
NotALiteral,
Literal { width_mappings: Vec<InnerWidthMapping> },
}

/// The type of format string that we are parsing.
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ParseMode {
Expand Down Expand Up @@ -306,7 +314,11 @@ impl<'a> Parser<'a> {
append_newline: bool,
mode: ParseMode,
) -> Parser<'a> {
let (width_map, is_literal) = find_width_map_from_snippet(snippet, style);
let input_string_kind = find_width_map_from_snippet(s, snippet, style);
let (width_map, is_literal) = match input_string_kind {
InputStringKind::Literal { width_mappings } => (width_mappings, true),
InputStringKind::NotALiteral => (Vec::new(), false),
};
Parser {
mode,
input: s,
Expand Down Expand Up @@ -844,20 +856,40 @@ impl<'a> Parser<'a> {
/// written code (code snippet) and the `InternedString` that gets processed in the `Parser`
/// in order to properly synthesise the intra-string `Span`s for error diagnostics.
fn find_width_map_from_snippet(
input: &str,
snippet: Option<string::String>,
str_style: Option<usize>,
) -> (Vec<InnerWidthMapping>, bool) {
) -> InputStringKind {
let snippet = match snippet {
Some(ref s) if s.starts_with('"') || s.starts_with("r\"") || s.starts_with("r#") => s,
_ => return (vec![], false),
_ => return InputStringKind::NotALiteral,
};

if str_style.is_some() {
return (vec![], true);
return InputStringKind::Literal { width_mappings: Vec::new() };
}

// Strip quotes.
let snippet = &snippet[1..snippet.len() - 1];

// Macros like `println` add a newline at the end. That technically doens't make them "literals" anymore, but it's fine
// since we will never need to point our spans there, so we lie about it here by ignoring it.
// Since there might actually be newlines in the source code, we need to normalize away all trailing newlines.
// If we only trimmed it off the input, `format!("\n")` would cause a mismatch as here we they actually match up.
// Alternatively, we could just count the trailing newlines and only trim one from the input if they don't match up.
let input_no_nl = input.trim_end_matches('\n');
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we only want to trim only one \n off the literal to deal with the println! case, then check strict literal equality? Or am I missing something more subtle here?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You are just like I was when I tried this at first.
If we have format!("uwu\n"), then trimming it off the input will cause a mismatch, as it's still present in the snippet.
Doing "only trim it when the snippet doesn't end with a newline" also doesn't work because it could be println!("uwu\n").
We would need to actually count the trailing newlines and match and only trim it off when the input has one more, but doing it like I did is a little simpler.

I will explain this better in the comment.

Copy link
Member

@compiler-errors compiler-errors Dec 27, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, yeah, I guess the important part is that we can't distinguish when the newline is added by the macro or already present in the literal. Oh well.

let Ok(unescaped) = unescape_string(snippet) else {
return InputStringKind::NotALiteral;
};

let unescaped_no_nl = unescaped.trim_end_matches('\n');

if unescaped_no_nl != input_no_nl {
// The source string that we're pointing at isn't our input, so spans pointing at it will be incorrect.
// This can for example happen with proc macros that respan generated literals.
return InputStringKind::NotALiteral;
}

let mut s = snippet.char_indices();
let mut width_mappings = vec![];
while let Some((pos, c)) = s.next() {
Expand Down Expand Up @@ -936,7 +968,21 @@ fn find_width_map_from_snippet(
_ => {}
}
}
(width_mappings, true)

InputStringKind::Literal { width_mappings }
}

fn unescape_string(string: &str) -> Result<string::String, unescape::EscapeError> {
let mut buf = string::String::new();
let mut error = Ok(());
unescape::unescape_literal(string, unescape::Mode::Str, &mut |_, unescaped_char| {
match unescaped_char {
Ok(c) => buf.push(c),
Err(err) => error = Err(err),
}
});

error.map(|_| buf)
}

// Assert a reasonable size for `Piece`
Expand Down
29 changes: 12 additions & 17 deletions compiler/rustc_span/src/source_map.rs
Original file line number Diff line number Diff line change
Expand Up @@ -964,45 +964,40 @@ impl SourceMap {

/// Finds the width of the character, either before or after the end of provided span,
/// depending on the `forwards` parameter.
#[instrument(skip(self, sp))]
fn find_width_of_character_at_span(&self, sp: Span, forwards: bool) -> u32 {
let sp = sp.data();

if sp.lo == sp.hi && !forwards {
debug!("find_width_of_character_at_span: early return empty span");
debug!("early return empty span");
return 1;
}

let local_begin = self.lookup_byte_offset(sp.lo);
let local_end = self.lookup_byte_offset(sp.hi);
debug!(
"find_width_of_character_at_span: local_begin=`{:?}`, local_end=`{:?}`",
local_begin, local_end
);
debug!("local_begin=`{:?}`, local_end=`{:?}`", local_begin, local_end);

if local_begin.sf.start_pos != local_end.sf.start_pos {
debug!("find_width_of_character_at_span: begin and end are in different files");
debug!("begin and end are in different files");
return 1;
}

let start_index = local_begin.pos.to_usize();
let end_index = local_end.pos.to_usize();
debug!(
"find_width_of_character_at_span: start_index=`{:?}`, end_index=`{:?}`",
start_index, end_index
);
debug!("start_index=`{:?}`, end_index=`{:?}`", start_index, end_index);

// Disregard indexes that are at the start or end of their spans, they can't fit bigger
// characters.
if (!forwards && end_index == usize::MIN) || (forwards && start_index == usize::MAX) {
debug!("find_width_of_character_at_span: start or end of span, cannot be multibyte");
debug!("start or end of span, cannot be multibyte");
return 1;
}

let source_len = (local_begin.sf.end_pos - local_begin.sf.start_pos).to_usize();
debug!("find_width_of_character_at_span: source_len=`{:?}`", source_len);
debug!("source_len=`{:?}`", source_len);
// Ensure indexes are also not malformed.
if start_index > end_index || end_index > source_len - 1 {
debug!("find_width_of_character_at_span: source indexes are malformed");
debug!("source indexes are malformed");
return 1;
}

Expand All @@ -1017,10 +1012,10 @@ impl SourceMap {
} else {
return 1;
};
debug!("find_width_of_character_at_span: snippet=`{:?}`", snippet);
debug!("snippet=`{:?}`", snippet);

let mut target = if forwards { end_index + 1 } else { end_index - 1 };
debug!("find_width_of_character_at_span: initial target=`{:?}`", target);
debug!("initial target=`{:?}`", target);

while !snippet.is_char_boundary(target - start_index) && target < source_len {
target = if forwards {
Expand All @@ -1033,9 +1028,9 @@ impl SourceMap {
}
}
};
debug!("find_width_of_character_at_span: target=`{:?}`", target);
debug!("target=`{:?}`", target);
}
debug!("find_width_of_character_at_span: final target=`{:?}`", target);
debug!("final target=`{:?}`", target);

if forwards { (target - end_index) as u32 } else { (end_index - target) as u32 }
}
Expand Down
14 changes: 13 additions & 1 deletion src/test/ui/fmt/auxiliary/format-string-proc-macro.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,8 @@

extern crate proc_macro;

use proc_macro::{Literal, Span, TokenStream, TokenTree};
use proc_macro::{Delimiter, Group, Ident, Literal, Punct, Spacing, Span, TokenStream, TokenTree};
use std::iter::FromIterator;

#[proc_macro]
pub fn foo_with_input_span(input: TokenStream) -> TokenStream {
Expand All @@ -26,3 +27,14 @@ pub fn err_with_input_span(input: TokenStream) -> TokenStream {

TokenStream::from(TokenTree::Literal(lit))
}

#[proc_macro]
pub fn respan_to_invalid_format_literal(input: TokenStream) -> TokenStream {
let mut s = Literal::string("{");
s.set_span(input.into_iter().next().unwrap().span());
TokenStream::from_iter([
TokenTree::from(Ident::new("format", Span::call_site())),
TokenTree::from(Punct::new('!', Spacing::Alone)),
TokenTree::from(Group::new(Delimiter::Parenthesis, TokenTree::from(s).into())),
])
}
10 changes: 10 additions & 0 deletions src/test/ui/fmt/respanned-literal-issue-106191.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// aux-build:format-string-proc-macro.rs

extern crate format_string_proc_macro;

fn main() {
format_string_proc_macro::respan_to_invalid_format_literal!("¡");
//~^ ERROR invalid format string: expected `'}'` but string was terminated
format_args!(r#concat!("¡ {"));
//~^ ERROR invalid format string: expected `'}'` but string was terminated
}
19 changes: 19 additions & 0 deletions src/test/ui/fmt/respanned-literal-issue-106191.stderr
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
error: invalid format string: expected `'}'` but string was terminated
--> $DIR/respanned-literal-issue-106191.rs:6:65
|
LL | format_string_proc_macro::respan_to_invalid_format_literal!("¡");
| ^^^ expected `'}'` in format string
|
= note: if you intended to print `{`, you can escape it using `{{`

error: invalid format string: expected `'}'` but string was terminated
--> $DIR/respanned-literal-issue-106191.rs:8:18
|
LL | format_args!(r#concat!("¡ {"));
| ^^^^^^^^^^^^^^^^^^^^^^^ expected `'}'` in format string
|
= note: if you intended to print `{`, you can escape it using `{{`
= note: this error originates in the macro `concat` (in Nightly builds, run with -Z macro-backtrace for more info)

error: aborting due to 2 previous errors