From 6971e9332d9a7fb1567a6df0f30d03452f505ed3 Mon Sep 17 00:00:00 2001
From: Zalathar
Date: Fri, 15 Dec 2023 14:12:46 +1100
Subject: [PATCH] coverage: `llvm-cov` expects column numbers to be bytes, not
 code points

---
 .../rustc_mir_transform/src/coverage/mod.rs | 72 +++++++++++++++----
 compiler/rustc_mir_transform/src/lib.rs     |  1 +
 tests/coverage/unicode.cov-map              | 22 +++---
 tests/coverage/unicode.coverage             | 40 +++++++++++
 tests/coverage/unicode.rs                   |  2 +-
 5 files changed, 111 insertions(+), 26 deletions(-)
 create mode 100644 tests/coverage/unicode.coverage

diff --git a/compiler/rustc_mir_transform/src/coverage/mod.rs b/compiler/rustc_mir_transform/src/coverage/mod.rs
index 98de9d829d28b..dcd7014f4fc90 100644
--- a/compiler/rustc_mir_transform/src/coverage/mod.rs
+++ b/compiler/rustc_mir_transform/src/coverage/mod.rs
@@ -23,7 +23,7 @@ use rustc_middle::mir::{
 use rustc_middle::ty::TyCtxt;
 use rustc_span::def_id::LocalDefId;
 use rustc_span::source_map::SourceMap;
-use rustc_span::{Span, Symbol};
+use rustc_span::{BytePos, Pos, RelativeBytePos, Span, Symbol};
 
 /// Inserts `StatementKind::Coverage` statements that either instrument the binary with injected
 /// counters, via intrinsic `llvm.instrprof.increment`, and/or inject metadata used during codegen
@@ -258,7 +258,16 @@ fn inject_statement(mir_body: &mut mir::Body<'_>, counter_kind: CoverageKind, bb
     data.statements.insert(0, statement);
 }
 
-/// Convert the Span into its file name, start line and column, and end line and column
+/// Convert the Span into its file name, start line and column, and end line and column.
+///
+/// Line numbers and column numbers are 1-based. Unlike most column numbers emitted by
+/// the compiler, these column numbers are denoted in **bytes**, because that's what
+/// LLVM's `llvm-cov` tool expects to see in coverage maps.
+///
+/// Returns `None` if the conversion failed for some reason. This shouldn't happen,
+/// but it's hard to rule out entirely (especially in the presence of complex macros
+/// or other expansions), and if it does happen then skipping a span or function is
+/// better than an ICE or `llvm-cov` failure that the user might have no way to avoid.
 fn make_code_region(
     source_map: &SourceMap,
     file_name: Symbol,
@@ -272,20 +281,55 @@ fn make_code_region(
         source_map.span_to_diagnostic_string(body_span)
     );
 
-    let (file, mut start_line, mut start_col, mut end_line, mut end_col) =
-        source_map.span_to_location_info(span);
-    if span.hi() == span.lo() {
-        // Extend an empty span by one character so the region will be counted.
-        if span.hi() == body_span.hi() {
-            start_col = start_col.saturating_sub(1);
-        } else {
-            end_col = start_col + 1;
-        }
+    let lo = span.lo();
+    let hi = span.hi();
+
+    let file = source_map.lookup_source_file(lo);
+    if !file.contains(hi) {
+        debug!(?span, ?file, ?lo, ?hi, "span crosses multiple files; skipping");
+        return None;
+    }
+
+    // Column numbers need to be in bytes, so we can't use the more convenient
+    // `SourceMap` methods for looking up file coordinates.
+    let rpos_and_line_and_byte_column = |pos: BytePos| -> Option<(RelativeBytePos, usize, usize)> {
+        let rpos = file.relative_position(pos);
+        let line_index = file.lookup_line(rpos)?;
+        let line_start = file.lines()[line_index];
+        // Line numbers and column numbers are 1-based, so add 1 to each.
+        Some((rpos, line_index + 1, (rpos - line_start).to_usize() + 1))
     };
-    if let Some(file) = file {
-        start_line = source_map.doctest_offset_line(&file.name, start_line);
-        end_line = source_map.doctest_offset_line(&file.name, end_line);
+
+    let (lo_rpos, mut start_line, mut start_col) = rpos_and_line_and_byte_column(lo)?;
+    let (hi_rpos, mut end_line, mut end_col) = rpos_and_line_and_byte_column(hi)?;
+
+    // If the span is empty, try to expand it horizontally by one character's
+    // worth of bytes, so that it is more visible in `llvm-cov` reports.
+    // We do this after resolving line/column numbers, so that empty spans at the
+    // end of a line get an extra column instead of wrapping to the next line.
+    if span.is_empty()
+        && body_span.contains(span)
+        && let Some(src) = &file.src
+    {
+        // Prefer to expand the end position, if it won't go outside the body span.
+        if hi < body_span.hi() {
+            let hi_rpos = hi_rpos.to_usize();
+            let nudge_bytes = src.ceil_char_boundary(hi_rpos + 1) - hi_rpos;
+            end_col += nudge_bytes;
+        } else if lo > body_span.lo() {
+            let lo_rpos = lo_rpos.to_usize();
+            let nudge_bytes = lo_rpos - src.floor_char_boundary(lo_rpos - 1);
+            // Subtract the nudge, but don't go below column 1.
+            start_col = start_col.saturating_sub(nudge_bytes).max(1);
+        }
+        // If neither nudge could be applied, stick with the empty span coordinates.
     }
+
+    // Apply an offset so that code in doctests has correct line numbers.
+    // FIXME(#79417): Currently we have no way to offset doctest _columns_.
+    start_line = source_map.doctest_offset_line(&file.name, start_line);
+    end_line = source_map.doctest_offset_line(&file.name, end_line);
+
     Some(CodeRegion {
         file_name,
         start_line: start_line as u32,
diff --git a/compiler/rustc_mir_transform/src/lib.rs b/compiler/rustc_mir_transform/src/lib.rs
index f5f51c0ec8ad1..2c1602dadc17c 100644
--- a/compiler/rustc_mir_transform/src/lib.rs
+++ b/compiler/rustc_mir_transform/src/lib.rs
@@ -9,6 +9,7 @@
 #![feature(min_specialization)]
 #![feature(never_type)]
 #![feature(option_get_or_insert_default)]
+#![feature(round_char_boundary)]
 #![feature(trusted_step)]
 #![feature(try_blocks)]
 #![feature(yeet_expr)]
diff --git a/tests/coverage/unicode.cov-map b/tests/coverage/unicode.cov-map
index 7648031f4df69..cd40194a0831b 100644
--- a/tests/coverage/unicode.cov-map
+++ b/tests/coverage/unicode.cov-map
@@ -1,5 +1,5 @@
 Function name: unicode::main
-Raw bytes (67): 0x[01, 01, 09, 01, 05, 03, 05, 1e, 0d, 22, 09, 03, 05, 11, 1b, 1e, 0d, 22, 09, 03, 05, 09, 01, 0e, 01, 00, 0b, 05, 01, 09, 00, 0b, 03, 00, 0f, 00, 18, 05, 00, 19, 00, 24, 22, 02, 08, 00, 13, 09, 00, 17, 00, 22, 11, 00, 23, 02, 06, 1b, 02, 06, 00, 07, 17, 02, 05, 01, 02]
+Raw bytes (67): 0x[01, 01, 09, 01, 05, 03, 05, 1e, 0d, 22, 09, 03, 05, 11, 1b, 1e, 0d, 22, 09, 03, 05, 09, 01, 0e, 01, 00, 0b, 05, 01, 09, 00, 0c, 03, 00, 10, 00, 1b, 05, 00, 1c, 00, 28, 22, 02, 08, 00, 25, 09, 00, 29, 00, 46, 11, 00, 47, 02, 06, 1b, 02, 06, 00, 07, 17, 02, 05, 01, 02]
 Number of files: 1
 - file 0 => global file 1
 Number of expressions: 9
@@ -14,34 +14,34 @@ Number of expressions: 9
 - expression 8 operands: lhs = Expression(0, Add), rhs = Counter(1)
 Number of file 0 mappings: 9
 - Code(Counter(0)) at (prev + 14, 1) to (start + 0, 11)
-- Code(Counter(1)) at (prev + 1, 9) to (start + 0, 11)
-- Code(Expression(0, Add)) at (prev + 0, 15) to (start + 0, 24)
+- Code(Counter(1)) at (prev + 1, 9) to (start + 0, 12)
+- Code(Expression(0, Add)) at (prev + 0, 16) to (start + 0, 27)
     = (c0 + c1)
-- Code(Counter(1)) at (prev + 0, 25) to (start + 0, 36)
-- Code(Expression(8, Sub)) at (prev + 2, 8) to (start + 0, 19)
+- Code(Counter(1)) at (prev + 0, 28) to (start + 0, 40)
+- Code(Expression(8, Sub)) at (prev + 2, 8) to (start + 0, 37)
     = ((c0 + c1) - c1)
-- Code(Counter(2)) at (prev + 0, 23) to (start + 0, 34)
-- Code(Counter(4)) at (prev + 0, 35) to (start + 2, 6)
+- Code(Counter(2)) at (prev + 0, 41) to (start + 0, 70)
+- Code(Counter(4)) at (prev + 0, 71) to (start + 2, 6)
 - Code(Expression(6, Add)) at (prev + 2, 6) to (start + 0, 7)
     = ((((c0 + c1) - c1) - c2) + c3)
 - Code(Expression(5, Add)) at (prev + 2, 5) to (start + 1, 2)
     = (c4 + ((((c0 + c1) - c1) - c2) + c3))
 
 Function name: unicode::サビ
-Raw bytes (9): 0x[01, 01, 00, 01, 01, 1e, 12, 00, 14]
+Raw bytes (9): 0x[01, 01, 00, 01, 01, 1e, 14, 00, 18]
 Number of files: 1
 - file 0 => global file 1
 Number of expressions: 0
 Number of file 0 mappings: 1
-- Code(Counter(0)) at (prev + 30, 18) to (start + 0, 20)
+- Code(Counter(0)) at (prev + 30, 20) to (start + 0, 24)
 
 Function name: unicode::他 (unused)
-Raw bytes (9): 0x[01, 01, 00, 01, 00, 1e, 15, 00, 1f]
+Raw bytes (9): 0x[01, 01, 00, 01, 00, 1e, 19, 00, 25]
 Number of files: 1
 - file 0 => global file 1
 Number of expressions: 0
 Number of file 0 mappings: 1
-- Code(Zero) at (prev + 30, 21) to (start + 0, 31)
+- Code(Zero) at (prev + 30, 25) to (start + 0, 37)
 
 Function name: unicode::申し訳ございません
 Raw bytes (9): 0x[01, 01, 00, 01, 01, 18, 01, 02, 02]
diff --git a/tests/coverage/unicode.coverage b/tests/coverage/unicode.coverage
new file mode 100644
index 0000000000000..b284a557d5754
--- /dev/null
+++ b/tests/coverage/unicode.coverage
@@ -0,0 +1,40 @@
+   LL|       |// edition: 2021
+   LL|       |// ignore-windows - we can't force `llvm-cov` to use ANSI escapes on Windows
+   LL|       |// llvm-cov-flags: --use-color
+   LL|       |
+   LL|       |// Check that column numbers are denoted in bytes, so that they don't cause
+   LL|       |// `llvm-cov` to fail or emit malformed output.
+   LL|       |//
+   LL|       |// Note that when `llvm-cov` prints ^ arrows on a subsequent line, it simply
+   LL|       |// inserts one space character for each "column", with no understanding of
+   LL|       |// Unicode or character widths. So those arrows will tend to be misaligned
+   LL|       |// for non-ASCII source code, regardless of whether column numbers are code
+   LL|       |// points or bytes.
+   LL|       |
+   LL|      1|fn main() {
+   LL|     33|    for _İ in 'А'..='Я' { /* Я */ }
+                                 ^32      ^32
+   LL|       |
+   LL|      1|    if 申し訳ございません() && 申し訳ございません() {
+                                              ^0
+   LL|      0|        println!("true");
+   LL|      1|    }
+   LL|       |
+   LL|      1|    サビ();
+   LL|      1|}
+   LL|       |
+   LL|      1|fn 申し訳ございません() -> bool {
+   LL|      1|    std::hint::black_box(false)
+   LL|      1|}
+   LL|       |
+   LL|       |macro_rules! macro_that_defines_a_function {
+   LL|       |    (fn $名:ident () $体:tt) => {
+   LL|      1|        fn $名 () $体 fn 他 () {}
+                                     ^0
+   LL|       |    }
+   LL|       |}
+   LL|       |
+   LL|       |macro_that_defines_a_function! {
+   LL|       |    fn サビ() {}
+   LL|       |}
+
diff --git a/tests/coverage/unicode.rs b/tests/coverage/unicode.rs
index 3335d3af458a5..dfc5ea69dd231 100644
--- a/tests/coverage/unicode.rs
+++ b/tests/coverage/unicode.rs
@@ -1,5 +1,5 @@
 // edition: 2021
-// ignore-mode-coverage-run - `llvm-cov` fails due to incorrectly-split UTF-8
+// ignore-windows - we can't force `llvm-cov` to use ANSI escapes on Windows
 // llvm-cov-flags: --use-color
 
 // Check that column numbers are denoted in bytes, so that they don't cause
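
As a standalone illustration of the byte-vs-code-point distinction handled by the patch above (this sketch is not part of the patch; its function names are invented for the example, and `ceil_char_boundary` below is a stable stand-in for the unstable `str::ceil_char_boundary` that the patch enables via `#![feature(round_char_boundary)]`):

// Standalone sketch: byte columns vs. code-point columns, and nudging an
// empty region forward by one character's worth of bytes without splitting
// a UTF-8 sequence. Helper names are illustrative, not from rustc.

/// Returns the 1-based *byte* column (what `llvm-cov` expects) and the
/// 1-based *character* column for the same position on a line.
fn byte_and_char_columns(line: &str, byte_pos: usize) -> (usize, usize) {
    let byte_col = byte_pos + 1;
    let char_col = line[..byte_pos].chars().count() + 1;
    (byte_col, char_col)
}

/// Smallest char boundary at or after `index`; a stable approximation of the
/// unstable `str::ceil_char_boundary` used in the patched code.
fn ceil_char_boundary(s: &str, index: usize) -> usize {
    let mut i = index.min(s.len());
    while !s.is_char_boundary(i) {
        i += 1;
    }
    i
}

fn main() {
    // Each katakana character in "サビ" is 3 bytes long in UTF-8.
    let line = "サビ();";
    let paren = line.find('(').unwrap(); // byte offset 6, character offset 2

    // Byte column 7 vs. character column 3 for the same `(`.
    assert_eq!(byte_and_char_columns(line, paren), (7, 3));

    // Widening an empty region that starts at byte 0: the nudge is one whole
    // character, i.e. 3 bytes here rather than 1.
    let lo_byte = 0;
    let nudge_bytes = ceil_char_boundary(line, lo_byte + 1) - lo_byte;
    assert_eq!(nudge_bytes, 3);

    println!("byte/char columns and char-boundary nudge check out");
}

Run as a normal binary (e.g. `rustc example.rs && ./example`); the assertions mirror why `llvm-cov` only accepts the byte-based coordinates emitted after this change.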