Auto merge of rust-lang#2647 - saethlin:current-span, r=RalfJung
Track local frames incrementally during execution

rust-lang/miri#2646 currently introduces a performance regression. This change removes that regression, and provides a minor perf improvement.

The existing lazy strategy for tracking the span we want to display is only as efficient as it is because we often create a `CurrentSpan` and then never call `.get()`. Most calls to the `before_memory_read` and `before_memory_write` hooks do not create any event that we store in `AllocHistory`. But data races are different: any memory read or write may race, so every call to those hooks needs access to the current local span.

So this switches to a strategy where we update some state in `Thread` and `FrameExtra` incrementally, upon entering and exiting each function call.
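
For illustration, here is a minimal self-contained sketch of the scheme, using made-up types rather than Miri's actual `Thread`/`FrameExtra`/`MiriMachine` API: each frame records once, at push time, whether it is "user-relevant", and the stack caches the index of the topmost such frame, so the hot hooks can find the frame to report against in O(1) instead of rescanning the stack.

```rust
// Sketch only: stand-ins for Miri's thread/frame bookkeeping, not the real types.

struct Frame {
    /// Stand-in for the real frame data; we only keep a label for demonstration.
    name: &'static str,
    /// Computed once when the frame is pushed (workspace-local and not #[track_caller]).
    is_user_relevant: bool,
}

#[derive(Default)]
struct CallStack {
    frames: Vec<Frame>,
    /// Cached index of the topmost user-relevant frame, maintained on push/pop.
    top_user_relevant_frame: Option<usize>,
}

impl CallStack {
    fn push(&mut self, name: &'static str, is_user_relevant: bool) {
        self.frames.push(Frame { name, is_user_relevant });
        if is_user_relevant {
            // The frame we just pushed is the topmost frame, so it is also the
            // topmost user-relevant one. Non-relevant pushes leave the cache alone.
            self.top_user_relevant_frame = Some(self.frames.len() - 1);
        }
    }

    fn pop(&mut self) {
        if let Some(frame) = self.frames.pop() {
            if frame.is_user_relevant {
                // We just removed the cached frame, so rescan for the next one below it.
                self.top_user_relevant_frame =
                    self.frames.iter().rposition(|f| f.is_user_relevant);
            }
        }
    }

    /// The frame whose span we would report: the cached user-relevant frame,
    /// or the top of the stack if no frame is user-relevant (or `None` if empty).
    fn reporting_frame(&self) -> Option<&Frame> {
        let idx = self.top_user_relevant_frame.or_else(|| self.frames.len().checked_sub(1))?;
        self.frames.get(idx)
    }
}

fn main() {
    let mut stack = CallStack::default();
    stack.push("main", true);
    stack.push("std::io::Write::write_fmt", false); // e.g. a std frame
    assert_eq!(stack.reporting_frame().unwrap().name, "main");
    stack.pop();
    stack.push("my_crate::helper", true);
    assert_eq!(stack.reporting_frame().unwrap().name, "my_crate::helper");
}
```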

Before:
```
Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/backtraces/Cargo.toml
  Time (mean ± σ):      5.532 s ±  0.022 s    [User: 5.444 s, System: 0.073 s]
  Range (min … max):    5.516 s …  5.569 s    5 runs

Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/mse/Cargo.toml
  Time (mean ± σ):     831.4 ms ±   3.0 ms    [User: 783.8 ms, System: 46.7 ms]
  Range (min … max):   828.7 ms … 836.1 ms    5 runs

Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/serde1/Cargo.toml
  Time (mean ± σ):      1.975 s ±  0.021 s    [User: 1.914 s, System: 0.059 s]
  Range (min … max):    1.939 s …  1.990 s    5 runs

Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/serde2/Cargo.toml
  Time (mean ± σ):      4.060 s ±  0.051 s    [User: 3.983 s, System: 0.071 s]
  Range (min … max):    3.972 s …  4.100 s    5 runs

Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/slice-get-unchecked/Cargo.toml
  Time (mean ± σ):     784.9 ms ±   8.2 ms    [User: 746.5 ms, System: 37.7 ms]
  Range (min … max):   772.9 ms … 793.3 ms    5 runs

Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/unicode/Cargo.toml
  Time (mean ± σ):      1.679 s ±  0.006 s    [User: 1.623 s, System: 0.055 s]
  Range (min … max):    1.673 s …  1.687 s    5 runs
```
After:
```
Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/backtraces/Cargo.toml
  Time (mean ± σ):      5.330 s ±  0.037 s    [User: 5.232 s, System: 0.084 s]
  Range (min … max):    5.280 s …  5.383 s    5 runs

Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/mse/Cargo.toml
  Time (mean ± σ):     818.9 ms ±   3.7 ms    [User: 776.8 ms, System: 41.3 ms]
  Range (min … max):   813.5 ms … 822.5 ms    5 runs

Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/serde1/Cargo.toml
  Time (mean ± σ):      1.927 s ±  0.011 s    [User: 1.864 s, System: 0.061 s]
  Range (min … max):    1.917 s …  1.945 s    5 runs

Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/serde2/Cargo.toml
  Time (mean ± σ):      3.974 s ±  0.020 s    [User: 3.893 s, System: 0.076 s]
  Range (min … max):    3.956 s …  4.004 s    5 runs

Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/slice-get-unchecked/Cargo.toml
  Time (mean ± σ):     780.0 ms ±   5.3 ms    [User: 740.3 ms, System: 39.0 ms]
  Range (min … max):   771.2 ms … 784.5 ms    5 runs

Benchmark 1: cargo +miri miri run --manifest-path /home/ben/miri/bench-cargo-miri/unicode/Cargo.toml
  Time (mean ± σ):      1.643 s ±  0.007 s    [User: 1.584 s, System: 0.058 s]
  Range (min … max):    1.635 s …  1.654 s    5 runs
```
(This change is marginal, but the point is that it avoids a much more significant regression.)
bors committed Nov 26, 2022
2 parents 2f92cf2 + 86052a9 commit fa91fa7
Showing 6 changed files with 153 additions and 161 deletions.
55 changes: 52 additions & 3 deletions src/concurrency/thread.rs
@@ -118,6 +118,13 @@ pub struct Thread<'mir, 'tcx> {
/// The virtual call stack.
stack: Vec<Frame<'mir, 'tcx, Provenance, FrameData<'tcx>>>,

/// The index of the topmost user-relevant frame in `stack`. This field must contain
/// the value produced by `compute_top_user_relevant_frame`; `None` means that there is
/// no user-relevant frame in `stack`.
/// This field is a cache to reduce how often we call that method. The cache is manually
/// maintained inside `MiriMachine::after_stack_push` and `MiriMachine::after_stack_pop`.
top_user_relevant_frame: Option<usize>,

/// The join status.
join_status: ThreadJoinStatus,

@@ -147,6 +154,40 @@ impl<'mir, 'tcx> Thread<'mir, 'tcx> {
fn thread_name(&self) -> &[u8] {
if let Some(ref thread_name) = self.thread_name { thread_name } else { b"<unnamed>" }
}

/// Return the top user-relevant frame, if there is one.
/// Note that the choice to return `None` here when there is no user-relevant frame is part of
/// justifying the optimization that only pushes of user-relevant frames require updating the
/// `top_user_relevant_frame` field.
fn compute_top_user_relevant_frame(&self) -> Option<usize> {
self.stack
.iter()
.enumerate()
.rev()
.find_map(|(idx, frame)| if frame.extra.is_user_relevant { Some(idx) } else { None })
}

/// Re-compute the top user-relevant frame from scratch.
pub fn recompute_top_user_relevant_frame(&mut self) {
self.top_user_relevant_frame = self.compute_top_user_relevant_frame();
}

/// Set the top user-relevant frame to the given value. Must be equal to what
/// `compute_top_user_relevant_frame` would return!
pub fn set_top_user_relevant_frame(&mut self, frame_idx: usize) {
debug_assert_eq!(Some(frame_idx), self.compute_top_user_relevant_frame());
self.top_user_relevant_frame = Some(frame_idx);
}

/// Returns the index of the topmost frame that is considered user-relevant, or the
/// top of the stack if there is no such frame, or `None` if the stack is empty.
pub fn top_user_relevant_frame(&self) -> Option<usize> {
debug_assert_eq!(self.top_user_relevant_frame, self.compute_top_user_relevant_frame());
// This can be called upon creation of an allocation. We create allocations while setting up
// parts of the Rust runtime when we do not have any stack frames yet, so we need to handle
// empty stacks.
self.top_user_relevant_frame.or_else(|| self.stack.len().checked_sub(1))
}
}

impl<'mir, 'tcx> std::fmt::Debug for Thread<'mir, 'tcx> {
@@ -167,6 +208,7 @@ impl<'mir, 'tcx> Default for Thread<'mir, 'tcx> {
state: ThreadState::Enabled,
thread_name: None,
stack: Vec::new(),
top_user_relevant_frame: None,
join_status: ThreadJoinStatus::Joinable,
panic_payload: None,
last_error: None,
@@ -184,8 +226,15 @@ impl<'mir, 'tcx> Thread<'mir, 'tcx> {

impl VisitTags for Thread<'_, '_> {
fn visit_tags(&self, visit: &mut dyn FnMut(SbTag)) {
let Thread { panic_payload, last_error, stack, state: _, thread_name: _, join_status: _ } =
self;
let Thread {
panic_payload,
last_error,
stack,
top_user_relevant_frame: _,
state: _,
thread_name: _,
join_status: _,
} = self;

panic_payload.visit_tags(visit);
last_error.visit_tags(visit);
@@ -414,7 +463,7 @@ impl<'mir, 'tcx: 'mir> ThreadManager<'mir, 'tcx> {
}

/// Get a shared borrow of the currently active thread.
fn active_thread_ref(&self) -> &Thread<'mir, 'tcx> {
pub fn active_thread_ref(&self) -> &Thread<'mir, 'tcx> {
&self.threads[self.active_thread]
}

76 changes: 20 additions & 56 deletions src/helpers.rs
@@ -936,78 +936,42 @@ pub trait EvalContextExt<'mir, 'tcx: 'mir>: crate::MiriInterpCxExt<'mir, 'tcx> {
}

impl<'mir, 'tcx> MiriMachine<'mir, 'tcx> {
pub fn current_span(&self) -> CurrentSpan<'_, 'mir, 'tcx> {
CurrentSpan { current_frame_idx: None, machine: self }
}
}

/// A `CurrentSpan` should be created infrequently (ideally once) per interpreter step. It does
/// nothing on creation, but when `CurrentSpan::get` is called, searches the current stack for the
/// topmost frame which corresponds to a local crate, and returns the current span in that frame.
/// The result of that search is cached so that later calls are approximately free.
#[derive(Clone)]
pub struct CurrentSpan<'a, 'mir, 'tcx> {
current_frame_idx: Option<usize>,
machine: &'a MiriMachine<'mir, 'tcx>,
}

impl<'a, 'mir: 'a, 'tcx: 'a + 'mir> CurrentSpan<'a, 'mir, 'tcx> {
pub fn machine(&self) -> &'a MiriMachine<'mir, 'tcx> {
self.machine
}

/// Get the current span, skipping non-local frames.
/// Get the current span in the topmost function which is workspace-local and not
/// `#[track_caller]`.
/// This function is backed by a cache, and can be assumed to be very fast.
pub fn get(&mut self) -> Span {
let idx = self.current_frame_idx();
self.stack().get(idx).map(Frame::current_span).unwrap_or(rustc_span::DUMMY_SP)
/// It will work even when the stack is empty.
pub fn current_span(&self) -> Span {
self.top_user_relevant_frame()
.map(|frame_idx| self.stack()[frame_idx].current_span())
.unwrap_or(rustc_span::DUMMY_SP)
}

/// Returns the span of the *caller* of the current operation, again
/// walking down the stack to find the closest frame in a local crate, if the caller of the
/// current operation is not in a local crate.
/// This is useful when we are processing something which occurs on function-entry and we want
/// to point at the call to the function, not the function definition generally.
pub fn get_caller(&mut self) -> Span {
pub fn caller_span(&self) -> Span {
// We need to go down at least to the caller (len - 2), or however
// far we have to go to find a frame in a local crate.
let local_frame_idx = self.current_frame_idx();
let stack = self.stack();
let idx = cmp::min(local_frame_idx, stack.len().saturating_sub(2));
stack.get(idx).map(Frame::current_span).unwrap_or(rustc_span::DUMMY_SP)
// far we have to go to find a frame in a local crate which is also not #[track_caller].
let frame_idx = self.top_user_relevant_frame().unwrap();
let frame_idx = cmp::min(frame_idx, self.stack().len().checked_sub(2).unwrap());
self.stack()[frame_idx].current_span()
}

fn stack(&self) -> &[Frame<'mir, 'tcx, Provenance, machine::FrameData<'tcx>>] {
self.machine.threads.active_thread_stack()
self.threads.active_thread_stack()
}

fn current_frame_idx(&mut self) -> usize {
*self
.current_frame_idx
.get_or_insert_with(|| Self::compute_current_frame_index(self.machine))
fn top_user_relevant_frame(&self) -> Option<usize> {
self.threads.active_thread_ref().top_user_relevant_frame()
}

// Find the position of the inner-most frame which is part of the crate being
// compiled/executed, part of the Cargo workspace, and is also not #[track_caller].
#[inline(never)]
fn compute_current_frame_index(machine: &MiriMachine<'_, '_>) -> usize {
machine
.threads
.active_thread_stack()
.iter()
.enumerate()
.rev()
.find_map(|(idx, frame)| {
let def_id = frame.instance.def_id();
if (def_id.is_local() || machine.local_crates.contains(&def_id.krate))
&& !frame.instance.def.requires_caller_location(machine.tcx)
{
Some(idx)
} else {
None
}
})
.unwrap_or(0)
/// This is the source of truth for the `is_user_relevant` flag in our `FrameExtra`.
pub fn is_user_relevant(&self, frame: &Frame<'mir, 'tcx, Provenance>) -> bool {
let def_id = frame.instance.def_id();
(def_id.is_local() || self.local_crates.contains(&def_id.krate))
&& !frame.instance.def.requires_caller_location(self.tcx)
}
}

2 changes: 1 addition & 1 deletion src/lib.rs
@@ -97,7 +97,7 @@ pub use crate::diagnostics::{
pub use crate::eval::{
create_ecx, eval_entry, AlignmentCheck, BacktraceStyle, IsolatedOp, MiriConfig, RejectOpWith,
};
pub use crate::helpers::{CurrentSpan, EvalContextExt as _};
pub use crate::helpers::EvalContextExt as _;
pub use crate::intptrcast::ProvenanceMode;
pub use crate::machine::{
AllocExtra, FrameData, MiriInterpCx, MiriInterpCxExt, MiriMachine, MiriMemoryKind,
43 changes: 28 additions & 15 deletions src/machine.rs
@@ -50,12 +50,18 @@ pub struct FrameData<'tcx> {
/// for the start of this frame. When we finish executing this frame,
/// we use this to register a completed event with `measureme`.
pub timing: Option<measureme::DetachedTiming>,

/// Indicates whether a `Frame` is part of a workspace-local crate and is also not
/// `#[track_caller]`. We compute this once on creation and store the result, as an
/// optimization.
/// This is used by `MiriMachine::current_span` and `MiriMachine::caller_span`.
pub is_user_relevant: bool,
}

impl<'tcx> std::fmt::Debug for FrameData<'tcx> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
// Omitting `timing`, it does not support `Debug`.
let FrameData { stacked_borrows, catch_unwind, timing: _ } = self;
let FrameData { stacked_borrows, catch_unwind, timing: _, is_user_relevant: _ } = self;
f.debug_struct("FrameData")
.field("stacked_borrows", stacked_borrows)
.field("catch_unwind", catch_unwind)
@@ -65,7 +71,7 @@ impl<'tcx> std::fmt::Debug for FrameData<'tcx> {

impl VisitTags for FrameData<'_> {
fn visit_tags(&self, visit: &mut dyn FnMut(SbTag)) {
let FrameData { catch_unwind, stacked_borrows, timing: _ } = self;
let FrameData { catch_unwind, stacked_borrows, timing: _, is_user_relevant: _ } = self;

catch_unwind.visit_tags(visit);
stacked_borrows.visit_tags(visit);
@@ -895,13 +901,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {

let alloc = alloc.into_owned();
let stacks = ecx.machine.stacked_borrows.as_ref().map(|stacked_borrows| {
Stacks::new_allocation(
id,
alloc.size(),
stacked_borrows,
kind,
ecx.machine.current_span(),
)
Stacks::new_allocation(id, alloc.size(), stacked_borrows, kind, &ecx.machine)
});
let race_alloc = ecx.machine.data_race.as_ref().map(|data_race| {
data_race::AllocExtra::new_allocation(
@@ -1016,8 +1016,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
prov_extra,
range,
machine.stacked_borrows.as_ref().unwrap(),
machine.current_span(),
&machine.threads,
machine,
)?;
}
if let Some(weak_memory) = &alloc_extra.weak_memory {
@@ -1048,8 +1047,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
prov_extra,
range,
machine.stacked_borrows.as_ref().unwrap(),
machine.current_span(),
&machine.threads,
machine,
)?;
}
if let Some(weak_memory) = &alloc_extra.weak_memory {
@@ -1083,8 +1081,7 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
prove_extra,
range,
machine.stacked_borrows.as_ref().unwrap(),
machine.current_span(),
&machine.threads,
machine,
)
} else {
Ok(())
@@ -1126,7 +1123,9 @@ impl<'mir, 'tcx> Machine<'mir, 'tcx> for MiriMachine<'mir, 'tcx> {
stacked_borrows: stacked_borrows.map(|sb| sb.borrow_mut().new_frame(&ecx.machine)),
catch_unwind: None,
timing,
is_user_relevant: ecx.machine.is_user_relevant(&frame),
};

Ok(frame.with_extra(extra))
}

@@ -1174,6 +1173,13 @@

#[inline(always)]
fn after_stack_push(ecx: &mut InterpCx<'mir, 'tcx, Self>) -> InterpResult<'tcx> {
if ecx.frame().extra.is_user_relevant {
// We just pushed a local frame, so we know that the topmost local frame is the topmost
// frame. If we push a non-local frame, there's no need to do anything.
let stack_len = ecx.active_thread_stack().len();
ecx.active_thread_mut().set_top_user_relevant_frame(stack_len - 1);
}

if ecx.machine.stacked_borrows.is_some() { ecx.retag_return_place() } else { Ok(()) }
}

@@ -1183,6 +1189,13 @@
mut frame: Frame<'mir, 'tcx, Provenance, FrameData<'tcx>>,
unwinding: bool,
) -> InterpResult<'tcx, StackPopJump> {
if frame.extra.is_user_relevant {
// All that we store is whether or not the frame we just removed is local, so now we
// have no idea where the next topmost local frame is. So we recompute it.
// (If this ever becomes a bottleneck, we could have `push` store the previous
// user-relevant frame and restore that here.)
ecx.active_thread_mut().recompute_top_user_relevant_frame();
}
let timing = frame.extra.timing.take();
if let Some(stacked_borrows) = &ecx.machine.stacked_borrows {
stacked_borrows.borrow_mut().end_call(&frame.extra);
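
The comment in `after_stack_pop` above keeps the rescan on pop and notes that, should it ever become a bottleneck, `push` could instead store the previous user-relevant frame and restore it on pop. A hypothetical sketch of that variant (again with made-up types, not code from this commit):

```rust
// Sketch of the O(1)-pop alternative mentioned in the `after_stack_pop` comment.

struct Frame {
    is_user_relevant: bool,
    /// What `top_user_relevant_frame` held just before this frame was pushed;
    /// only consulted when `is_user_relevant` is true.
    saved_top_user_relevant_frame: Option<usize>,
}

#[derive(Default)]
struct CallStack {
    frames: Vec<Frame>,
    top_user_relevant_frame: Option<usize>,
}

impl CallStack {
    fn push(&mut self, is_user_relevant: bool) {
        let saved = self.top_user_relevant_frame;
        self.frames.push(Frame { is_user_relevant, saved_top_user_relevant_frame: saved });
        if is_user_relevant {
            self.top_user_relevant_frame = Some(self.frames.len() - 1);
        }
    }

    fn pop(&mut self) {
        if let Some(frame) = self.frames.pop() {
            if frame.is_user_relevant {
                // Restore in O(1) instead of rescanning `frames`.
                self.top_user_relevant_frame = frame.saved_top_user_relevant_frame;
            }
        }
    }
}

fn main() {
    let mut stack = CallStack::default();
    stack.push(true); // user code
    stack.push(false); // e.g. a std frame
    stack.push(true); // user code again
    assert_eq!(stack.top_user_relevant_frame, Some(2));
    stack.pop();
    assert_eq!(stack.top_user_relevant_frame, Some(0));
    stack.pop();
    stack.pop();
    assert_eq!(stack.top_user_relevant_frame, None);
}
```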
