diff --git a/cranelift/codegen/src/alias_analysis.rs b/cranelift/codegen/src/alias_analysis.rs
new file mode 100644
index 000000000000..ba76d13c9eb0
--- /dev/null
+++ b/cranelift/codegen/src/alias_analysis.rs
@@ -0,0 +1,388 @@
+//! Alias analysis, consisting of a "last store" pass and a "memory
+//! values" pass. These two passes operate as one fused pass, and so
+//! are implemented together here.
+//!
+//! We partition memory state into several *disjoint pieces* of
+//! "abstract state". There are a finite number of such pieces:
+//! currently, we call them "heap", "table", "vmctx", and "other". Any
+//! given address in memory belongs to exactly one disjoint piece.
+//!
+//! We never track which piece a concrete address belongs to at
+//! runtime; this is a purely static concept. Instead, all
+//! memory-accessing instructions (loads and stores) are labeled with
+//! one of these four categories in their `MemFlags`. It is forbidden
+//! for a load or store to access memory under one category and a
+//! later load or store to access the same memory under a different
+//! category. This is ensured to be true by construction during
+//! frontend translation into CLIF and during legalization.
+//!
+//! Given that this non-aliasing property is ensured by the producer
+//! of CLIF, we can compute a *may-alias* property: one load or store
+//! may-alias another load or store if both access the same category
+//! of abstract state.
+//!
+//! The "last store" pass helps to compute this aliasing: it scans the
+//! code, finding at each program point the last instruction that
+//! *might have* written to a given part of abstract state.
+//!
+//! We can't say for sure that the "last store" *did* actually write
+//! that state, but we know for sure that no instruction *later* than
+//! it (up to the current instruction) did. However, we can get a
+//! must-alias property from this: if at a given load or store, we
+//! look backward to the "last store", *AND* we find that it has
+//! exactly the same address expression and type, then we know that
+//! the current instruction's access *must* be to the same memory
+//! location.
+//!
+//! To get this must-alias property, we compute a sparse table of
+//! "memory values": these are known equivalences between SSA `Value`s
+//! and particular locations in memory. The memory-values table is a
+//! mapping from (last store, address expression, type) to SSA
+//! value. At a store, we can insert into this table directly. At a
+//! load, we can also insert, if we don't already have a value (from
+//! the store that produced the load's value).
+//!
+//! Then we can do two optimizations at once given this table. If a
+//! load accesses a location identified by a (last store, address,
+//! type) key already in the table, we replace it with the SSA value
+//! for that memory location. This is usually known as "redundant load
+//! elimination" if the value came from an earlier load of the same
+//! location, or "store-to-load forwarding" if the value came from an
+//! earlier store to the same location.
+//!
+//! In theory we could also do *dead-store elimination*, where if a
+//! store overwrites a key in the table, *and* if no other load/store
+//! to the abstract state category occurred, *and* no other trapping
+//! instruction occurred (at which point we need an up-to-date memory
+//! state because post-trap-termination memory state can be observed),
+//! *and* we can prove the original store could not have trapped, then
+//! we can eliminate the original store. Because this is so complex,
+//! and the conditions for doing it correctly when post-trap state
+//! must be correct likely reduce the potential benefit, we don't yet
+//! do this.
+
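Concretely, the may-alias relation collapses to an equality check on the category. A minimal standalone sketch (the `AbstractState` enum is illustrative only; the real pass reads the category from each access's `MemFlags`, with unlabeled accesses falling into "other"):

```rust
// Illustrative stand-in for the four disjoint categories of abstract
// state; not part of the Cranelift API.
#[derive(Clone, Copy, PartialEq, Eq, Debug)]
enum AbstractState {
    Heap,
    Table,
    Vmctx,
    Other,
}

// Two accesses may alias only when they are labeled with the same
// category; the CLIF producer guarantees the categories are disjoint.
fn may_alias(a: AbstractState, b: AbstractState) -> bool {
    a == b
}

fn main() {
    assert!(may_alias(AbstractState::Heap, AbstractState::Heap));
    assert!(!may_alias(AbstractState::Heap, AbstractState::Table));
    assert!(!may_alias(AbstractState::Vmctx, AbstractState::Other));
}
```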
+use crate::{
+    cursor::{Cursor, FuncCursor},
+    dominator_tree::DominatorTree,
+    fx::{FxHashMap, FxHashSet},
+    inst_predicates::{
+        has_memory_fence_semantics, inst_addr_offset_type, inst_store_data, visit_block_succs,
+    },
+    ir::{immediates::Offset32, Block, Function, Inst, Opcode, Type, Value},
+};
+use cranelift_entity::{packed_option::PackedOption, EntityRef};
+
+/// For a given program point, the vector of last-store instruction
+/// indices for each disjoint category of abstract state.
+#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
+struct LastStores {
+    heap: PackedOption<Inst>,
+    table: PackedOption<Inst>,
+    vmctx: PackedOption<Inst>,
+    other: PackedOption<Inst>,
+}
+
+impl LastStores {
+    fn update(&mut self, func: &Function, inst: Inst) {
+        let opcode = func.dfg[inst].opcode();
+        if has_memory_fence_semantics(opcode) {
+            self.heap = inst.into();
+            self.table = inst.into();
+            self.vmctx = inst.into();
+            self.other = inst.into();
+        } else if opcode.can_store() {
+            if let Some(memflags) = func.dfg[inst].memflags() {
+                if memflags.heap() {
+                    self.heap = inst.into();
+                } else if memflags.table() {
+                    self.table = inst.into();
+                } else if memflags.vmctx() {
+                    self.vmctx = inst.into();
+                } else {
+                    self.other = inst.into();
+                }
+            } else {
+                self.heap = inst.into();
+                self.table = inst.into();
+                self.vmctx = inst.into();
+                self.other = inst.into();
+            }
+        }
+    }
+
+    fn get_last_store(&self, func: &Function, inst: Inst) -> PackedOption<Inst> {
+        if let Some(memflags) = func.dfg[inst].memflags() {
+            if memflags.heap() {
+                self.heap
+            } else if memflags.table() {
+                self.table
+            } else if memflags.vmctx() {
+                self.vmctx
+            } else {
+                self.other
+            }
+        } else if func.dfg[inst].opcode().can_load() || func.dfg[inst].opcode().can_store() {
+            inst.into()
+        } else {
+            PackedOption::default()
+        }
+    }
+
+    fn meet_from(&mut self, other: &LastStores, loc: Inst) {
+        let meet = |a: PackedOption<Inst>, b: PackedOption<Inst>| -> PackedOption<Inst> {
+            match (a.into(), b.into()) {
+                (None, None) => None.into(),
+                (Some(a), None) => a.into(),
+                (None, Some(b)) => b.into(),
+                (Some(a), Some(b)) if a == b => a.into(),
+                _ => loc.into(),
+            }
+        };
+
+        self.heap = meet(self.heap, other.heap);
+        self.table = meet(self.table, other.table);
+        self.vmctx = meet(self.vmctx, other.vmctx);
+        self.other = meet(self.other, other.other);
+    }
+}
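The meet function's behavior is easiest to see in isolation. A self-contained sketch, with plain `u32` indices standing in for `Inst` and `Option` standing in for `PackedOption` (hypothetical types, not the Cranelift ones):

```rust
// Hypothetical stand-ins: u32 for Inst, Option<u32> for PackedOption<Inst>.
fn meet(a: Option<u32>, b: Option<u32>, merge_point: u32) -> Option<u32> {
    match (a, b) {
        (None, None) => None,
        (Some(x), None) | (None, Some(x)) => Some(x),
        (Some(x), Some(y)) if x == y => Some(x),
        // Conflicting last stores: pretend the merge point itself is a
        // store, so nothing is forwarded across the join.
        _ => Some(merge_point),
    }
}

fn main() {
    assert_eq!(meet(Some(7), Some(7), 99), Some(7)); // agree: state survives
    assert_eq!(meet(Some(7), Some(8), 99), Some(99)); // conflict: join clobbers
    assert_eq!(meet(None, Some(8), 99), Some(8)); // one-sided: adopt it
}
```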
+
+/// A key identifying a unique memory location.
+///
+/// For the result of a load to be equivalent to the result of
+/// another load, or to the data from a store, we need (i) the
+/// "version" of memory to match (here ensured by having the same
+/// last store instruction to touch the disjoint category of abstract
+/// state we're accessing); (ii) the address to be the same (here
+/// ensured by having the same SSA value, which doesn't change once
+/// computed); (iii) the offset to be the same; and (iv) the accessed
+/// type and extension mode (e.g., 8-to-32, signed) to be the same.
+#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
+struct MemoryLoc {
+    last_store: PackedOption<Inst>,
+    address: Value,
+    offset: Offset32,
+    ty: Type,
+    /// We keep the *opcode* of the instruction that produced the
+    /// value we record at this key if the opcode is anything other
+    /// than an ordinary load or store. This is needed when we
+    /// consider loads that extend the value: e.g., an 8-to-32
+    /// sign-extending load will produce a 32-bit value from an 8-bit
+    /// value in memory, so we can only reuse that (as part of RLE)
+    /// for another load with the same extending opcode.
+    ///
+    /// We could improve the transform to insert explicit extend ops
+    /// in place of extending loads when we know the memory value, but
+    /// we haven't yet done this.
+    extending_opcode: Option<Opcode>,
+}
+
+/// An alias-analysis pass.
+pub struct AliasAnalysis<'a> {
+    /// The function we're analyzing.
+    func: &'a mut Function,
+
+    /// The domtree for the function.
+    domtree: &'a DominatorTree,
+
+    /// Input state to a basic block.
+    block_input: FxHashMap<Block, LastStores>,
+
+    /// Known memory-value equivalences. This is the result of the
+    /// analysis. This is a mapping from (last store, address
+    /// expression, offset, type) to SSA `Value`.
+    ///
+    /// We keep the defining inst around for quick dominance checks.
+    mem_values: FxHashMap<MemoryLoc, (Inst, Value)>,
+}
+
+impl<'a> AliasAnalysis<'a> {
+    /// Perform an alias analysis pass.
+    pub fn new(func: &'a mut Function, domtree: &'a DominatorTree) -> AliasAnalysis<'a> {
+        log::trace!("alias analysis: input is:\n{:?}", func);
+        let mut analysis = AliasAnalysis {
+            func,
+            domtree,
+            block_input: FxHashMap::default(),
+            mem_values: FxHashMap::default(),
+        };
+
+        analysis.compute_block_input_states();
+        analysis
+    }
+
+    fn compute_block_input_states(&mut self) {
+        let mut queue = vec![];
+        let mut queue_set = FxHashSet::default();
+        let entry = self.func.layout.entry_block().unwrap();
+        queue.push(entry);
+        queue_set.insert(entry);
+
+        while let Some(block) = queue.pop() {
+            queue_set.remove(&block);
+            let mut state = self
+                .block_input
+                .entry(block)
+                .or_insert_with(|| LastStores::default())
+                .clone();
+
+            log::trace!(
+                "alias analysis: input to block{} is {:?}",
+                block.index(),
+                state
+            );
+
+            for inst in self.func.layout.block_insts(block) {
+                state.update(self.func, inst);
+                log::trace!("after inst{}: state is {:?}", inst.index(), state);
+            }
+
+            visit_block_succs(self.func, block, |_inst, succ| {
+                let succ_first_inst = self
+                    .func
+                    .layout
+                    .block_insts(succ)
+                    .into_iter()
+                    .next()
+                    .unwrap();
+                let updated = match self.block_input.get_mut(&succ) {
+                    Some(succ_state) => {
+                        let old = succ_state.clone();
+                        succ_state.meet_from(&state, succ_first_inst);
+                        *succ_state != old
+                    }
+                    None => {
+                        self.block_input.insert(succ, state.clone());
+                        true
+                    }
+                };
+
+                if updated && queue_set.insert(succ) {
+                    queue.push(succ);
+                }
+            });
+        }
+    }
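Before reading the rewrite loop below, it may help to see the memory-values idea in miniature: a map keyed on (last store, address, offset) lets a later load be answered from a remembered store. A standalone sketch with hypothetical `u32` stand-ins for instructions, values, and addresses (the real key also includes the type and the extending opcode):

```rust
use std::collections::HashMap;

// Key: (last store that may have written this state, address, offset).
type MemKey = (Option<u32>, u32, i32);

fn main() {
    let mut mem_values: HashMap<MemKey, u32> = HashMap::new();

    // A store at inst 10 of value v7 to (addr v3, offset 8): record it.
    let store_inst = 10;
    mem_values.insert((Some(store_inst), 3, 8), 7);

    // A later load whose last-store state is still inst 10 and whose
    // (address, offset) matches: forward the stored value.
    assert_eq!(
        mem_values.get(&(Some(store_inst), 3, 8)),
        Some(&7) // store-to-load forwarding
    );

    // After another store to the same category, the last-store state
    // changes, the key no longer matches, and the load must stay.
    let clobber_inst = 12;
    assert_eq!(mem_values.get(&(Some(clobber_inst), 3, 8)), None);
}
```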
+
+    /// Make a pass and update known-redundant loads to aliased
+    /// values. We interleave the updates with the memory-location
+    /// tracking because resolving some aliases may expose others
+    /// (e.g. in cases of double-indirection with two separate chains
+    /// of loads).
+    pub fn compute_and_update_aliases(&mut self) {
+        let mut pos = FuncCursor::new(self.func);
+
+        while let Some(block) = pos.next_block() {
+            let mut state = self
+                .block_input
+                .get(&block)
+                .cloned()
+                .unwrap_or_else(|| LastStores::default());
+
+            while let Some(inst) = pos.next_inst() {
+                log::trace!(
+                    "alias analysis: scanning at inst{} with state {:?} ({:?})",
+                    inst.index(),
+                    state,
+                    pos.func.dfg[inst],
+                );
+
+                if let Some((address, offset, ty)) = inst_addr_offset_type(pos.func, inst) {
+                    let address = pos.func.dfg.resolve_aliases(address);
+                    let opcode = pos.func.dfg[inst].opcode();
+
+                    if opcode.can_store() {
+                        let store_data = inst_store_data(pos.func, inst).unwrap();
+                        let store_data = pos.func.dfg.resolve_aliases(store_data);
+                        let mem_loc = MemoryLoc {
+                            last_store: inst.into(),
+                            address,
+                            offset,
+                            ty,
+                            extending_opcode: get_ext_opcode(opcode),
+                        };
+                        log::trace!(
+                            "alias analysis: at inst{}: store with data v{} at loc {:?}",
+                            inst.index(),
+                            store_data.index(),
+                            mem_loc
+                        );
+                        self.mem_values.insert(mem_loc, (inst, store_data));
+                    } else if opcode.can_load() {
+                        let last_store = state.get_last_store(pos.func, inst);
+                        let load_result = pos.func.dfg.inst_results(inst)[0];
+                        let mem_loc = MemoryLoc {
+                            last_store,
+                            address,
+                            offset,
+                            ty,
+                            extending_opcode: get_ext_opcode(opcode),
+                        };
+                        log::trace!(
+                            "alias analysis: at inst{}: load with last_store inst{} at loc {:?}",
+                            inst.index(),
+                            last_store.map(|inst| inst.index()).unwrap_or(usize::MAX),
+                            mem_loc
+                        );
+
+                        // Is there a Value already known to be stored
+                        // at this specific memory location? If so,
+                        // we can alias the load result to this
+                        // already-known Value.
+                        //
+                        // Check if the definition dominates this
+                        // location; it might not, if it comes from a
+                        // load (stores will always dominate, though, if
+                        // their `last_store` survives through
+                        // meet-points to this use-site).
+                        let aliased = if let Some((def_inst, value)) =
+                            self.mem_values.get(&mem_loc).cloned()
+                        {
+                            log::trace!(
+                                " -> sees known value v{} from inst{}",
+                                value.index(),
+                                def_inst.index()
+                            );
+                            if self.domtree.dominates(def_inst, inst, &pos.func.layout) {
+                                log::trace!(
+                                    " -> dominates; value equiv from v{} to v{} inserted",
+                                    load_result.index(),
+                                    value.index()
+                                );
+
+                                pos.func.dfg.detach_results(inst);
+                                pos.func.dfg.change_to_alias(load_result, value);
+                                pos.remove_inst_and_step_back();
+                                true
+                            } else {
+                                false
+                            }
+                        } else {
+                            false
+                        };
+
+                        // Otherwise, we can keep *this* load around
+                        // as a new equivalent value.
+                        if !aliased {
+                            log::trace!(
+                                " -> inserting load result v{} at loc {:?}",
+                                load_result.index(),
+                                mem_loc
+                            );
+                            self.mem_values.insert(mem_loc, (inst, load_result));
+                        }
+                    }
+                }
+
+                state.update(pos.func, inst);
+            }
+        }
+    }
+}
+
+fn get_ext_opcode(op: Opcode) -> Option<Opcode> {
+    debug_assert!(op.can_load() || op.can_store());
+    match op {
+        Opcode::Load | Opcode::Store => None,
+        _ => Some(op),
+    }
+}
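The reason `get_ext_opcode` above folds the opcode into the memory-value key: two loads of the same bytes with different extension modes produce different values. A quick standalone demonstration of the 8-to-32 case:

```rust
// Why the extension mode must be part of the key: the same 8-bit byte
// in memory yields different 32-bit values under zero- vs.
// sign-extension, so a uload8 result can never stand in for an sload8
// result (illustrative only).
fn main() {
    let byte_in_memory: u8 = 0x80;
    let zero_extended = byte_in_memory as u32; // like uload8.i32
    let sign_extended = byte_in_memory as i8 as i32; // like sload8.i32
    assert_eq!(zero_extended, 0x0000_0080);
    assert_eq!(sign_extended as u32, 0xFFFF_FF80);
}
```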
diff --git a/cranelift/codegen/src/context.rs b/cranelift/codegen/src/context.rs
index bedc5aeb1e85..c97cfd305d20 100644
--- a/cranelift/codegen/src/context.rs
+++ b/cranelift/codegen/src/context.rs
@@ -9,6 +9,7 @@
 //! contexts concurrently. Typically, you would have one context per compilation thread and only a
 //! single ISA instance.
 
+use crate::alias_analysis::AliasAnalysis;
 use crate::binemit::CodeInfo;
 use crate::dce::do_dce;
 use crate::dominator_tree::DominatorTree;
@@ -162,6 +163,11 @@ impl Context {
         self.remove_constant_phis(isa)?;
 
+        if opt_level != OptLevel::None {
+            self.replace_redundant_loads()?;
+            self.simple_gvn(isa)?;
+        }
+
         let result = isa.compile_function(&self.func, self.want_disasm)?;
         let info = result.code_info();
         self.mach_compile_result = Some(result);
@@ -341,6 +347,17 @@ impl Context {
         self.verify_if(fisa)
     }
 
+    /// Replace all redundant loads with the known values in
+    /// memory. These are loads whose values were already loaded by
+    /// other loads earlier, as well as loads whose values were stored
+    /// by a store instruction to the same location (so-called
+    /// "store-to-load forwarding").
+    pub fn replace_redundant_loads(&mut self) -> CodegenResult<()> {
+        let mut analysis = AliasAnalysis::new(&mut self.func, &self.domtree);
+        analysis.compute_and_update_aliases();
+        Ok(())
+    }
+
     /// Harvest candidate left-hand sides for superoptimization with Souper.
     #[cfg(feature = "souper-harvest")]
     pub fn souper_harvest(
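As wired up above, the pass runs during compilation whenever optimizations are enabled, but it can also be driven directly. A minimal sketch (a hypothetical helper mirroring the filetest harness later in this patch; `flowgraph()` computes the CFG and dominator tree that the analysis needs):

```rust
use cranelift_codegen::{ir::Function, CodegenResult, Context};

// Hypothetical helper: run only redundant-load elimination on an
// already-built function and hand back the rewritten IR.
fn run_rle(func: Function) -> CodegenResult<Function> {
    let mut ctx = Context::for_function(func);
    ctx.flowgraph(); // compute CFG + dominator tree first
    ctx.replace_redundant_loads()?;
    Ok(ctx.func)
}
```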
diff --git a/cranelift/codegen/src/inst_predicates.rs b/cranelift/codegen/src/inst_predicates.rs
index 518487af2b56..8d36742979ce 100644
--- a/cranelift/codegen/src/inst_predicates.rs
+++ b/cranelift/codegen/src/inst_predicates.rs
@@ -1,6 +1,7 @@
 //! Instruction predicates/properties, shared by various analyses.
-
-use crate::ir::{DataFlowGraph, Function, Inst, InstructionData, Opcode};
+use crate::ir::immediates::Offset32;
+use crate::ir::instructions::BranchInfo;
+use crate::ir::{Block, DataFlowGraph, Function, Inst, InstructionData, Opcode, Type, Value};
 use crate::machinst::ty_bits;
 use cranelift_entity::EntityRef;
@@ -78,3 +79,78 @@ pub fn is_constant_64bit(func: &Function, inst: Inst) -> Option<u64> {
         _ => None,
     }
 }
+
+/// Get the address, offset, and access type from the given instruction, if any.
+pub fn inst_addr_offset_type(func: &Function, inst: Inst) -> Option<(Value, Offset32, Type)> {
+    let data = &func.dfg[inst];
+    match data {
+        InstructionData::Load { arg, offset, .. } => {
+            let ty = func.dfg.value_type(func.dfg.inst_results(inst)[0]);
+            Some((*arg, *offset, ty))
+        }
+        InstructionData::LoadNoOffset { arg, .. } => {
+            let ty = func.dfg.value_type(func.dfg.inst_results(inst)[0]);
+            Some((*arg, 0.into(), ty))
+        }
+        InstructionData::Store { args, offset, .. } => {
+            let ty = func.dfg.value_type(args[0]);
+            Some((args[1], *offset, ty))
+        }
+        InstructionData::StoreNoOffset { args, .. } => {
+            let ty = func.dfg.value_type(args[0]);
+            Some((args[1], 0.into(), ty))
+        }
+        _ => None,
+    }
+}
+
+/// Get the store data, if any, from an instruction.
+pub fn inst_store_data(func: &Function, inst: Inst) -> Option<Value> {
+    let data = &func.dfg[inst];
+    match data {
+        InstructionData::Store { args, .. } | InstructionData::StoreNoOffset { args, .. } => {
+            Some(args[0])
+        }
+        _ => None,
+    }
+}
+
+/// Determine whether this opcode behaves as a memory fence, i.e.,
+/// prohibits any moving of memory accesses across it.
+pub fn has_memory_fence_semantics(op: Opcode) -> bool {
+    match op {
+        Opcode::AtomicRmw
+        | Opcode::AtomicCas
+        | Opcode::AtomicLoad
+        | Opcode::AtomicStore
+        | Opcode::Fence => true,
+        Opcode::Call | Opcode::CallIndirect => true,
+        _ => false,
+    }
+}
+
+/// Visit all successors of a block with a given visitor closure.
+pub(crate) fn visit_block_succs<F: FnMut(Inst, Block)>(f: &Function, block: Block, mut visit: F) {
+    for inst in f.layout.block_likely_branches(block) {
+        if f.dfg[inst].opcode().is_branch() {
+            visit_branch_targets(f, inst, &mut visit);
+        }
+    }
+}
+
+fn visit_branch_targets<F: FnMut(Inst, Block)>(f: &Function, inst: Inst, visit: &mut F) {
+    match f.dfg[inst].analyze_branch(&f.dfg.value_lists) {
+        BranchInfo::NotABranch => {}
+        BranchInfo::SingleDest(dest, _) => {
+            visit(inst, dest);
+        }
+        BranchInfo::Table(table, maybe_dest) => {
+            if let Some(dest) = maybe_dest {
+                visit(inst, dest);
+            }
+            for &dest in f.jump_tables[table].as_slice() {
+                visit(inst, dest);
+            }
+        }
+    }
+}
diff --git a/cranelift/codegen/src/ir/memflags.rs b/cranelift/codegen/src/ir/memflags.rs
index 4ff76b623cdd..0c073f49bedd 100644
--- a/cranelift/codegen/src/ir/memflags.rs
+++ b/cranelift/codegen/src/ir/memflags.rs
@@ -11,9 +11,20 @@ enum FlagBit {
     Readonly,
     LittleEndian,
     BigEndian,
+    /// Accesses only the "heap" part of abstract state. Used for
+    /// alias analysis. Mutually exclusive with "table" and "vmctx".
+    Heap,
+    /// Accesses only the "table" part of abstract state. Used for
+    /// alias analysis. Mutually exclusive with "heap" and "vmctx".
+    Table,
+    /// Accesses only the "vmctx" part of abstract state. Used for
+    /// alias analysis. Mutually exclusive with "heap" and "table".
+    Vmctx,
 }
 
-const NAMES: [&str; 5] = ["notrap", "aligned", "readonly", "little", "big"];
+const NAMES: [&str; 8] = [
+    "notrap", "aligned", "readonly", "little", "big", "heap", "table", "vmctx",
+];
 
 /// Endianness of a memory access.
 #[derive(Clone, Copy, PartialEq, Eq, Debug, Hash)]
@@ -110,6 +121,12 @@ impl MemFlags {
         assert!(!(self.read(FlagBit::LittleEndian) && self.read(FlagBit::BigEndian)));
     }
 
+    /// Set endianness of the memory access, returning new flags.
+    pub fn with_endianness(mut self, endianness: Endianness) -> Self {
+        self.set_endianness(endianness);
+        self
+    }
+
     /// Test if the `notrap` flag is set.
     ///
     /// Normally, trapping is part of the semantics of a load/store operation. If the platform
@@ -128,6 +145,12 @@ impl MemFlags {
         self.set(FlagBit::Notrap)
     }
 
+    /// Set the `notrap` flag, returning new flags.
+    pub fn with_notrap(mut self) -> Self {
+        self.set_notrap();
+        self
+    }
+
     /// Test if the `aligned` flag is set.
     ///
     /// By default, Cranelift memory instructions work with any unaligned effective address. If the
@@ -142,6 +165,12 @@ impl MemFlags {
         self.set(FlagBit::Aligned)
     }
 
+    /// Set the `aligned` flag, returning new flags.
+    pub fn with_aligned(mut self) -> Self {
+        self.set_aligned();
+        self
+    }
+
     /// Test if the `readonly` flag is set.
     ///
     /// Loads with this flag have no memory dependencies.
@@ -155,6 +184,87 @@ impl MemFlags {
     pub fn set_readonly(&mut self) {
         self.set(FlagBit::Readonly)
     }
+
+    /// Set the `readonly` flag, returning new flags.
+    pub fn with_readonly(mut self) -> Self {
+        self.set_readonly();
+        self
+    }
+
+    /// Test if the `heap` bit is set.
+    ///
+    /// Loads and stores with this flag access the "heap" part of
+    /// abstract state. This is disjoint from the "table", "vmctx",
+    /// and "other" parts of abstract state. In concrete terms, this
+    /// means that behavior is undefined if the same memory is also
+    /// accessed by another load/store with one of the other
+    /// alias-analysis bits (`table`, `vmctx`) set, or with `heap` not
+    /// set.
+    pub fn heap(self) -> bool {
+        self.read(FlagBit::Heap)
+    }
+
+    /// Set the `heap` bit. See the notes about mutual exclusion with
+    /// other bits in `heap()`.
+    pub fn set_heap(&mut self) {
+        assert!(!self.table() && !self.vmctx());
+        self.set(FlagBit::Heap);
+    }
+
+    /// Set the `heap` bit, returning new flags.
+    pub fn with_heap(mut self) -> Self {
+        self.set_heap();
+        self
+    }
+
+    /// Test if the `table` bit is set.
+    ///
+    /// Loads and stores with this flag access the "table" part of
+    /// abstract state. This is disjoint from the "heap", "vmctx",
+    /// and "other" parts of abstract state. In concrete terms, this
+    /// means that behavior is undefined if the same memory is also
+    /// accessed by another load/store with one of the other
+    /// alias-analysis bits (`heap`, `vmctx`) set, or with `table` not
+    /// set.
+    pub fn table(self) -> bool {
+        self.read(FlagBit::Table)
+    }
+
+    /// Set the `table` bit. See the notes about mutual exclusion with
+    /// other bits in `table()`.
+    pub fn set_table(&mut self) {
+        assert!(!self.heap() && !self.vmctx());
+        self.set(FlagBit::Table);
+    }
+
+    /// Set the `table` bit, returning new flags.
+    pub fn with_table(mut self) -> Self {
+        self.set_table();
+        self
+    }
+
+    /// Test if the `vmctx` bit is set.
+    ///
+    /// Loads and stores with this flag access the "vmctx" part of
+    /// abstract state. This is disjoint from the "heap", "table",
+    /// and "other" parts of abstract state. In concrete terms, this
+    /// means that behavior is undefined if the same memory is also
+    /// accessed by another load/store with one of the other
+    /// alias-analysis bits (`heap`, `table`) set, or with `vmctx` not
+    /// set.
+    pub fn vmctx(self) -> bool {
+        self.read(FlagBit::Vmctx)
+    }
+
+    /// Set the `vmctx` bit. See the notes about mutual exclusion with
+    /// other bits in `vmctx()`.
+    pub fn set_vmctx(&mut self) {
+        assert!(!self.heap() && !self.table());
+        self.set(FlagBit::Vmctx);
+    }
+
+    /// Set the `vmctx` bit, returning new flags.
+    pub fn with_vmctx(mut self) -> Self {
+        self.set_vmctx();
+        self
+    }
 }
 
 impl fmt::Display for MemFlags {
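Typical use of the new builder-style helpers when labeling accesses (a short sketch; note that the heap/table/vmctx setters assert mutual exclusion, so setting two of them on the same flags panics):

```rust
use cranelift_codegen::ir::MemFlags;

fn main() {
    // trusted() = notrap + aligned; then tag the disjoint category.
    let heap_access = MemFlags::trusted().with_heap();
    let table_access = MemFlags::trusted().with_table();

    assert!(heap_access.heap() && !heap_access.table());
    assert!(table_access.table() && !table_access.vmctx());
    // heap_access.with_table() would panic: the bits are exclusive.
}
```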
diff --git a/cranelift/codegen/src/lib.rs b/cranelift/codegen/src/lib.rs
index 2cd498f9c160..71d649b19d86 100644
--- a/cranelift/codegen/src/lib.rs
+++ b/cranelift/codegen/src/lib.rs
@@ -91,6 +91,7 @@ pub use crate::entity::packed_option;
 pub use crate::machinst::buffer::{MachCallSite, MachReloc, MachSrcLoc, MachStackMap, MachTrap};
 pub use crate::machinst::TextSectionBuilder;
 
+mod alias_analysis;
 mod bitset;
 mod constant_hash;
 mod context;
diff --git a/cranelift/codegen/src/machinst/blockorder.rs b/cranelift/codegen/src/machinst/blockorder.rs
index d4ddc15125c1..82fec1968b18 100644
--- a/cranelift/codegen/src/machinst/blockorder.rs
+++ b/cranelift/codegen/src/machinst/blockorder.rs
@@ -71,8 +71,8 @@
 use crate::entity::SecondaryMap;
 use crate::fx::{FxHashMap, FxHashSet};
+use crate::inst_predicates::visit_block_succs;
 use crate::ir::{Block, Function, Inst, Opcode};
-use crate::machinst::lower::visit_block_succs;
 use crate::machinst::*;
 
 use smallvec::SmallVec;
diff --git a/cranelift/codegen/src/machinst/lower.rs b/cranelift/codegen/src/machinst/lower.rs
index aec7072d6044..a496f9a61657 100644
--- a/cranelift/codegen/src/machinst/lower.rs
+++ b/cranelift/codegen/src/machinst/lower.rs
@@ -9,7 +9,6 @@ use crate::data_value::DataValue;
 use crate::entity::SecondaryMap;
 use crate::fx::{FxHashMap, FxHashSet};
 use crate::inst_predicates::{has_lowering_side_effect, is_constant_64bit};
-use crate::ir::instructions::BranchInfo;
 use crate::ir::{
     types::{FFLAGS, IFLAGS},
     ArgumentPurpose, Block, Constant, ConstantData, DataFlowGraph, ExternalName, Function,
@@ -1491,29 +1490,3 @@ impl<'func, I: VCodeInst> LowerCtx for Lower<'func, I> {
         self.vcode.set_vreg_alias(from, to);
     }
 }
-
-/// Visit all successors of a block with a given visitor closure.
-pub(crate) fn visit_block_succs<F: FnMut(Inst, Block)>(f: &Function, block: Block, mut visit: F) {
-    for inst in f.layout.block_likely_branches(block) {
-        if f.dfg[inst].opcode().is_branch() {
-            visit_branch_targets(f, inst, &mut visit);
-        }
-    }
-}
-
-fn visit_branch_targets<F: FnMut(Inst, Block)>(f: &Function, inst: Inst, visit: &mut F) {
-    match f.dfg[inst].analyze_branch(&f.dfg.value_lists) {
-        BranchInfo::NotABranch => {}
-        BranchInfo::SingleDest(dest, _) => {
-            visit(inst, dest);
-        }
-        BranchInfo::Table(table, maybe_dest) => {
-            if let Some(dest) = maybe_dest {
-                visit(inst, dest);
-            }
-            for &dest in f.jump_tables[table].as_slice() {
-                visit(inst, dest);
-            }
-        }
-    }
-}
diff --git a/cranelift/filetests/filetests/alias/categories.clif b/cranelift/filetests/filetests/alias/categories.clif
new file mode 100644
index 000000000000..1e2c2b3e3a05
--- /dev/null
+++ b/cranelift/filetests/filetests/alias/categories.clif
@@ -0,0 +1,22 @@
+test alias-analysis
+set opt_level=speed
+target aarch64
+
+;; Check that aliasing properly respects the last store in each
+;; "category" separately.
+
+function %f0(i64, i64) -> i32, i32 {
+
+block0(v0: i64, v1: i64):
+    v2 = iconst.i32 42
+    v3 = iconst.i32 43
+    store.i32 heap v2, v0+8
+    store.i32 table v3, v1+8
+
+    v4 = load.i32 heap v0+8
+    v5 = load.i32 table v1+8
+    ; check: v4 -> v2
+    ; check: v5 -> v3
+
+    return v4, v5
+}
diff --git a/cranelift/filetests/filetests/alias/extends.clif b/cranelift/filetests/filetests/alias/extends.clif
new file mode 100644
index 000000000000..d6bbf7d4a837
--- /dev/null
+++ b/cranelift/filetests/filetests/alias/extends.clif
@@ -0,0 +1,44 @@
+test alias-analysis
+set opt_level=speed
+target aarch64
+
+;; Test that extension modes are properly accounted for when deciding
+;; whether loads alias.
+
+function %f0(i64 vmctx, i32) -> i32, i32, i32, i64, i64, i64 {
+    gv0 = vmctx
+    gv1 = load.i64 notrap readonly aligned gv0+8
+    heap0 = static gv1, bound 0x1_0000_0000, offset_guard 0x8000_0000, index_type i32
+
+block0(v0: i64, v1: i32):
+    v2 = heap_addr.i64 heap0, v1, 0
+
+    ;; Initial load. This will not be reused by anything below, even
+    ;; though it does access the same address.
+    v3 = load.i32 v2+8
+
+    ;; These loads must remain (must not be removed as redundant).
+    v4 = uload8.i32 v2+8
+    ; check: v4 = uload8.i32 v2+8
+    v5 = sload8.i32 v2+8
+    ; check: v5 = sload8.i32 v2+8
+    v6 = load.i64 v2+8
+    ; check: v6 = load.i64 v2+8
+
+    ;; 8-bit store only partially overwrites the address.
+    istore8 v6, v2+8
+
+    ;; This must not pick up the store data.
+    v7 = load.i64 v2+8
+    ; check: v7 = load.i64 v2+8
+
+    ;; Another store, this one non-truncating but actually using an
+    ;; `i8` value.
+    v8 = iconst.i8 123
+    store.i8 v8, v2+8
+
+    v9 = load.i64 v2+8
+    ; check: v9 = load.i64 v2+8
+
+    return v3, v4, v5, v6, v7, v9
+}
diff --git a/cranelift/filetests/filetests/alias/fence.clif b/cranelift/filetests/filetests/alias/fence.clif
new file mode 100644
index 000000000000..3202dbfcd750
--- /dev/null
+++ b/cranelift/filetests/filetests/alias/fence.clif
@@ -0,0 +1,45 @@
+test alias-analysis
+set opt_level=speed
+target aarch64
+
+;; Test that certain instructions act as fences that prevent alias
+;; analysis from moving accesses across them.
+
+function %f0(i64 vmctx, i32) -> i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 {
+    gv0 = vmctx
+    gv1 = load.i64 notrap readonly aligned gv0+8
+    heap0 = static gv1, bound 0x1_0000_0000, offset_guard 0x8000_0000, index_type i32
+
+block0(v0: i64, v1: i32):
+    v2 = heap_addr.i64 heap0, v1, 0
+
+    v3 = load.i32 v2+8
+    v4 = load.i32 vmctx v0+16
+
+    atomic_store.i32 v1, v0
+
+    v5 = load.i32 vmctx v0+16
+    ; check: v5 = load.i32 vmctx v0+16
+
+    v6 = atomic_cas.i32 v0, v1, v1
+
+    v7 = load.i32 vmctx v0+16
+    ; check: v7 = load.i32 vmctx v0+16
+
+    fence
+
+    v8 = load.i32 vmctx v0+16
+    ; check: v8 = load.i32 vmctx v0+16
+
+    v9 = atomic_rmw.i32 add v0, v1
+
+    v10 = load.i32 vmctx v0+16
+    ; check: v10 = load.i32 vmctx v0+16
+
+    v11 = atomic_load.i32 v0
+
+    v12 = load.i32 vmctx v0+16
+    ; check: v12 = load.i32 vmctx v0+16
+
+    return v3, v4, v5, v6, v7, v8, v9, v10, v11, v12
+}
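A toy model of why each checked load above survives: any fence-semantics instruction (atomics, `fence`, calls) clobbers the last-store state for every category at once, so the memory-value key for the next load never matches the previous one. Hypothetical `u32`/`Option` stand-ins again:

```rust
// Toy mirror (not the Cranelift types) of LastStores under a fence.
#[derive(Debug, PartialEq, Default)]
struct ToyLastStores {
    heap: Option<u32>,
    table: Option<u32>,
    vmctx: Option<u32>,
    other: Option<u32>,
}

impl ToyLastStores {
    // A fence-like instruction becomes the last store for everything.
    fn update_fence(&mut self, inst: u32) {
        self.heap = Some(inst);
        self.table = Some(inst);
        self.vmctx = Some(inst);
        self.other = Some(inst);
    }
}

fn main() {
    let mut state = ToyLastStores::default();
    state.update_fence(6); // e.g., an atomic_cas at toy index 6
    state.update_fence(9); // e.g., an atomic_rmw at toy index 9
    // Every category now points at the most recent fence, so any
    // memory-value key recorded before it is stale.
    let expected = ToyLastStores {
        heap: Some(9),
        table: Some(9),
        vmctx: Some(9),
        other: Some(9),
    };
    assert_eq!(state, expected);
}
```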
diff --git a/cranelift/filetests/filetests/alias/multiple-blocks.clif b/cranelift/filetests/filetests/alias/multiple-blocks.clif
new file mode 100644
index 000000000000..3812c8911fbb
--- /dev/null
+++ b/cranelift/filetests/filetests/alias/multiple-blocks.clif
@@ -0,0 +1,29 @@
+test alias-analysis
+set opt_level=speed
+target aarch64
+
+;; Check RLE across basic blocks.
+
+function %f0(i64 vmctx, i32) -> i32 {
+    gv0 = vmctx
+    gv1 = load.i64 notrap readonly aligned gv0+8
+    heap0 = static gv1, bound 0x1_0000_0000, offset_guard 0x8000_0000, index_type i32
+
+block0(v0: i64, v1: i32):
+    v2 = heap_addr.i64 heap0, v1, 0
+    v3 = load.i32 v2+8
+    brz v2, block1
+    jump block2
+
+block1:
+    v4 = load.i32 v2+8
+    ; check: v4 -> v3
+    jump block3(v4)
+
+block2:
+    jump block3(v3)
+
+block3(v5: i32):
+    return v5
+}
diff --git a/cranelift/filetests/filetests/alias/partial-redundancy.clif b/cranelift/filetests/filetests/alias/partial-redundancy.clif
new file mode 100644
index 000000000000..e869d262f1b5
--- /dev/null
+++ b/cranelift/filetests/filetests/alias/partial-redundancy.clif
@@ -0,0 +1,35 @@
+test alias-analysis
+set opt_level=speed
+target aarch64
+
+;; A test of partial redundancy: we should *not* RLE when an earlier
+;; load of the location is present in only one of multiple
+;; predecessors.
+
+function %f0(i64 vmctx, i32) -> i32, i32 {
+    gv0 = vmctx
+    gv1 = load.i64 notrap readonly aligned gv0+8
+    heap0 = static gv1, bound 0x1_0000_0000, offset_guard 0x8000_0000, index_type i32
+    fn0 = %g(i64 vmctx)
+
+block0(v0: i64, v1: i32):
+    brz v1, block1
+    jump block2
+
+block1:
+    v2 = heap_addr.i64 heap0, v1, 0
+    v3 = load.i32 v2+64
+    jump block3(v3)
+
+block2:
+    v4 = heap_addr.i64 heap0, v1, 0
+    v5 = load.i32 v4+128
+    jump block3(v5)
+
+block3(v6: i32):
+    v7 = heap_addr.i64 heap0, v1, 0
+    v8 = load.i32 v7+64
+    ;; load should survive:
+    ; check: v8 = load.i32 v7+64
+    return v6, v8
+}
diff --git a/cranelift/filetests/filetests/alias/simple-alias.clif b/cranelift/filetests/filetests/alias/simple-alias.clif
new file mode 100644
index 000000000000..9b559bc3e571
--- /dev/null
+++ b/cranelift/filetests/filetests/alias/simple-alias.clif
@@ -0,0 +1,54 @@
+test alias-analysis
+set opt_level=speed
+target aarch64
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Redundant-load elimination
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %f0(i64 vmctx, i32) -> i32, i32, i32, i32 {
+    gv0 = vmctx
+    gv1 = load.i64 notrap readonly aligned gv0+8
+    heap0 = static gv1, bound 0x1_0000_0000, offset_guard 0x8000_0000, index_type i32
+    fn0 = %g(i64 vmctx)
+
+block0(v0: i64, v1: i32):
+    v2 = heap_addr.i64 heap0, v1, 0
+    v3 = load.i32 v2+8
+    ;; This should reuse the load above.
+    v4 = heap_addr.i64 heap0, v1, 0
+    v5 = load.i32 v4+8
+    ; check: v5 -> v3
+
+    call fn0(v0)
+
+    ;; The second load is redundant wrt the first, but the call above
+    ;; is a barrier that prevents reusing v3 or v5.
+    v6 = load.i32 v4+8
+    v7 = load.i32 v4+8
+    ; check: v7 -> v6
+
+    return v3, v5, v6, v7
+}
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; Store-to-load forwarding
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+function %f1(i64 vmctx, i32) -> i32 {
+    gv0 = vmctx
+    gv1 = load.i64 notrap readonly aligned gv0+8
+    heap0 = static gv1, bound 0x1_0000_0000, offset_guard 0x8000_0000, index_type i32
+    fn0 = %g(i64 vmctx)
+
+block0(v0: i64, v1: i32):
+    v2 = heap_addr.i64 heap0, v1, 0
+    store.i32 v1, v2+8
+
+    ;; This load should pick up the store above.
+    v3 = heap_addr.i64 heap0, v1, 0
+    v4 = load.i32 v3+8
+    ; check: v4 -> v1
+
+    return v4
+}
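The partial redundancy above is rejected by the dominance check in `compute_and_update_aliases`: the remembered load in `block1` does not dominate the use in `block3`. A toy immediate-dominator walk over that diamond CFG (blocks as plain `u32` ids, hypothetical):

```rust
// Toy dominance check for the diamond block0 -> {block1, block2} -> block3.
// Only block0 dominates block3, so a load remembered from block1 alone
// cannot replace the load in block3.
fn dominates(def_block: u32, use_block: u32) -> bool {
    // Immediate-dominator table for the diamond: block0 is the root.
    const IDOM: [Option<u32>; 4] = [None, Some(0), Some(0), Some(0)];
    let mut cur = use_block;
    loop {
        if cur == def_block {
            return true;
        }
        match IDOM[cur as usize] {
            Some(parent) => cur = parent,
            None => return false,
        }
    }
}

fn main() {
    assert!(!dominates(1, 3)); // block1's load can't serve block3
    assert!(dominates(0, 3)); // a def in block0 could
}
```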
diff --git a/cranelift/filetests/src/lib.rs b/cranelift/filetests/src/lib.rs
index 133d61062187..351d2097e493 100644
--- a/cranelift/filetests/src/lib.rs
+++ b/cranelift/filetests/src/lib.rs
@@ -37,6 +37,7 @@ mod runone;
 mod runtest_environment;
 mod subtest;
 
+mod test_alias_analysis;
 mod test_cat;
 mod test_compile;
 mod test_dce;
@@ -111,6 +112,7 @@ pub fn run_passes(
 /// a `.clif` test file.
 fn new_subtest(parsed: &TestCommand) -> anyhow::Result<Box<dyn SubTest>> {
     match parsed.command {
+        "alias-analysis" => test_alias_analysis::subtest(parsed),
         "cat" => test_cat::subtest(parsed),
         "compile" => test_compile::subtest(parsed),
         "dce" => test_dce::subtest(parsed),
diff --git a/cranelift/filetests/src/test_alias_analysis.rs b/cranelift/filetests/src/test_alias_analysis.rs
new file mode 100644
index 000000000000..8d155811ba1c
--- /dev/null
+++ b/cranelift/filetests/src/test_alias_analysis.rs
@@ -0,0 +1,48 @@
+//! Test command for testing the alias analysis pass.
+//!
+//! The `alias-analysis` test command runs each function through GVN
+//! and then alias analysis after ensuring that all instructions are
+//! legal for the target.
+//!
+//! The resulting function is sent to `filecheck`.
+
+use crate::subtest::{run_filecheck, Context, SubTest};
+use cranelift_codegen;
+use cranelift_codegen::ir::Function;
+use cranelift_reader::TestCommand;
+use std::borrow::Cow;
+
+struct TestAliasAnalysis;
+
+pub fn subtest(parsed: &TestCommand) -> anyhow::Result<Box<dyn SubTest>> {
+    assert_eq!(parsed.command, "alias-analysis");
+    if !parsed.options.is_empty() {
+        anyhow::bail!("No options allowed on {}", parsed);
+    }
+    Ok(Box::new(TestAliasAnalysis))
+}
+
+impl SubTest for TestAliasAnalysis {
+    fn name(&self) -> &'static str {
+        "alias-analysis"
+    }
+
+    fn is_mutating(&self) -> bool {
+        true
+    }
+
+    fn run(&self, func: Cow<Function>, context: &Context) -> anyhow::Result<()> {
+        let mut comp_ctx = cranelift_codegen::Context::for_function(func.into_owned());
+
+        comp_ctx.flowgraph();
+        comp_ctx
+            .simple_gvn(context.flags_or_isa())
+            .map_err(|e| crate::pretty_anyhow_error(&comp_ctx.func, Into::into(e)))?;
+        comp_ctx
+            .replace_redundant_loads()
+            .map_err(|e| crate::pretty_anyhow_error(&comp_ctx.func, Into::into(e)))?;
+
+        let text = comp_ctx.func.display().to_string();
+        run_filecheck(&text, context)
+    }
+}
diff --git a/cranelift/wasm/src/code_translator.rs b/cranelift/wasm/src/code_translator.rs
index 9276b3f7c0df..2edc80d22d4c 100644
--- a/cranelift/wasm/src/code_translator.rs
+++ b/cranelift/wasm/src/code_translator.rs
@@ -160,7 +160,9 @@ pub fn translate_operator(
             GlobalVariable::Const(val) => val,
             GlobalVariable::Memory { gv, offset, ty } => {
                 let addr = builder.ins().global_value(environ.pointer_type(), gv);
-                let flags = ir::MemFlags::trusted();
+                let mut flags = ir::MemFlags::trusted();
+                // Put globals in the "table" disjoint category of abstract state.
+                flags.set_table();
                 builder.ins().load(ty, flags, addr, offset)
             }
             GlobalVariable::Custom => environ.translate_custom_global_get(
@@ -175,7 +177,9 @@ pub fn translate_operator(
             GlobalVariable::Const(_) => panic!("global #{} is a constant", *global_index),
             GlobalVariable::Memory { gv, offset, ty } => {
                 let addr = builder.ins().global_value(environ.pointer_type(), gv);
-                let flags = ir::MemFlags::trusted();
+                let mut flags = ir::MemFlags::trusted();
+                // Put globals in the "table" disjoint category of abstract state.
+                flags.set_table();
                 let mut val = state.pop1();
                 // Ensure SIMD values are cast to their default Cranelift type, I8x16.
                 if ty.is_vector() {
@@ -2349,6 +2353,12 @@ fn prepare_addr(
     let mut flags = MemFlags::new();
     flags.set_endianness(ir::Endianness::Little);
 
+    // The access occurs to the `heap` disjoint category of abstract
+    // state. This may allow alias analysis to merge redundant loads,
+    // etc. when heap accesses occur interleaved with other (table,
+    // vmctx, stack) accesses.
+    flags.set_heap();
+
     Ok((flags, addr, offset.into()))
 }
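In sum, Wasm translation now labels linear-memory accesses `heap` and global accesses `table`. A sketch of the resulting flag construction (assuming `cranelift_codegen` in scope; it mirrors `prepare_addr` and the `global.get`/`global.set` hunks above):

```rust
use cranelift_codegen::ir::{Endianness, MemFlags};

// Wasm linear-memory accesses: little-endian, tagged `heap`.
fn wasm_heap_flags() -> MemFlags {
    let mut flags = MemFlags::new();
    flags.set_endianness(Endianness::Little);
    flags.set_heap();
    flags
}

// Wasm globals: trusted (notrap + aligned) and tagged `table`.
fn wasm_global_flags() -> MemFlags {
    let mut flags = MemFlags::trusted();
    flags.set_table();
    flags
}

fn main() {
    assert!(wasm_heap_flags().heap());
    assert!(wasm_global_flags().table());
}
```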
diff --git a/crates/cranelift/src/func_environ.rs b/crates/cranelift/src/func_environ.rs
index d30917294dab..1ceeec6b0a5d 100644
--- a/crates/cranelift/src/func_environ.rs
+++ b/crates/cranelift/src/func_environ.rs
@@ -279,8 +279,7 @@ impl<'module_environment> FuncEnvironment<'module_environment> {
         let vmctx = self.vmctx(&mut pos.func);
         let base = pos.ins().global_value(pointer_type, vmctx);
 
-        let mut mem_flags = ir::MemFlags::trusted();
-        mem_flags.set_readonly();
+        let mem_flags = ir::MemFlags::trusted().with_readonly();
 
         // Load the base of the array of builtin functions
         let array_offset = i32::try_from(self.offsets.vmctx_builtin_functions()).unwrap();
@@ -766,9 +765,8 @@ impl<'module_environment> FuncEnvironment<'module_environment> {
         // if null, we take a slow-path that invokes a
        // libcall.
         let table_entry_addr = builder.ins().table_addr(pointer_type, table, index, 0);
-        let value = builder
-            .ins()
-            .load(pointer_type, ir::MemFlags::trusted(), table_entry_addr, 0);
+        let flags = ir::MemFlags::trusted().with_table();
+        let value = builder.ins().load(pointer_type, flags, table_entry_addr, 0);
         // Mask off the "initialized bit". See documentation on
         // FUNCREF_INIT_BIT in crates/environ/src/ref_bits.rs for more
         // details.
@@ -979,10 +977,8 @@ impl<'module_environment> cranelift_wasm::FuncEnvironment for FuncEnvironment<'m
         // Load the table element.
         let elem_addr = builder.ins().table_addr(pointer_type, table, index, 0);
-        let elem =
-            builder
-                .ins()
-                .load(reference_type, ir::MemFlags::trusted(), elem_addr, 0);
+        let flags = ir::MemFlags::trusted().with_table();
+        let elem = builder.ins().load(reference_type, flags, elem_addr, 0);
 
         let elem_is_null = builder.ins().is_null(elem);
         builder.ins().brnz(elem_is_null, continue_block, &[]);
@@ -1087,12 +1083,10 @@ impl<'module_environment> cranelift_wasm::FuncEnvironment for FuncEnvironment<'m
                     let value_with_init_bit = builder
                         .ins()
                         .bor_imm(value, Imm64::from(FUNCREF_INIT_BIT as i64));
-                    builder.ins().store(
-                        ir::MemFlags::trusted(),
-                        value_with_init_bit,
-                        table_entry_addr,
-                        0,
-                    );
+                    let flags = ir::MemFlags::trusted().with_table();
+                    builder
+                        .ins()
+                        .store(flags, value_with_init_bit, table_entry_addr, 0);
                     Ok(())
                 }
             },
@@ -1172,13 +1166,9 @@ impl<'module_environment> cranelift_wasm::FuncEnvironment for FuncEnvironment<'m
         // saving a reference to a deallocated object, and then using it
         // after its been freed).
         builder.switch_to_block(check_current_elem_block);
-        let current_elem =
-            builder
-                .ins()
-                .load(pointer_type, ir::MemFlags::trusted(), table_entry_addr, 0);
-        builder
-            .ins()
-            .store(ir::MemFlags::trusted(), value, table_entry_addr, 0);
+        let flags = ir::MemFlags::trusted().with_table();
+        let current_elem = builder.ins().load(pointer_type, flags, table_entry_addr, 0);
+        builder.ins().store(flags, value, table_entry_addr, 0);
 
         // If the current element is non-null, decrement its reference
         // count. And if its reference count has reached zero, then make
@@ -1561,8 +1551,7 @@ impl<'module_environment> cranelift_wasm::FuncEnvironment for FuncEnvironment<'m
         // `*mut VMCallerCheckedAnyfunc` base pointer from `VMContext`
         // and then loading, based on `SignatureIndex`, the
         // corresponding entry.
-        let mut mem_flags = ir::MemFlags::trusted();
-        mem_flags.set_readonly();
+        let mem_flags = ir::MemFlags::trusted().with_readonly();
         let signatures = builder.ins().load(
             pointer_type,
             mem_flags,