Skip to content

Commit

Permalink
[CSSPGO] Unblock optimizations with pseudo probe instrumentation.
Browse files Browse the repository at this point in the history
The IR/MIR pseudo probe intrinsics don't get materialized into real machine instructions, and therefore they don't incur runtime cost directly. However, they come with an indirect cost by blocking certain optimizations. Some of the blocking is intentional (such as blocking code merge) for better counts quality, while the rest is accidental. This change unblocks perf-critical optimizations that do not affect counts quality. They include:

1. IR InstCombine, sinking load operation to shorten lifetimes.
2. MIR LiveRangeShrink, similar to #1
3. MIR TwoAddressInstructionPass, i.e., the opeq transform
4. MIR function argument copy elision
5. IR stack protection (not perf-critical, but nice to have).

Reviewed By: wmi

Differential Revision: https://reviews.llvm.org/D95982
  • Loading branch information
htyu authored and tstellar committed Feb 20, 2021
1 parent 1071279 commit e8e45f5
Show file tree
Hide file tree
Showing 15 changed files with 209 additions and 13 deletions.
7 changes: 7 additions & 0 deletions llvm/include/llvm/CodeGen/MachineInstr.h
Original file line number Diff line number Diff line change
Expand Up @@ -1156,6 +1156,10 @@ class MachineInstr
return getOpcode() == TargetOpcode::CFI_INSTRUCTION;
}

/// Returns true when this instruction's opcode is
/// TargetOpcode::PSEUDO_PROBE.
bool isPseudoProbe() const {
  const bool HasProbeOpcode = getOpcode() == TargetOpcode::PSEUDO_PROBE;
  return HasProbeOpcode;
}

/// True if the instruction represents a position in the function, i.e. it is
/// either a label or a CFI instruction.
bool isPosition() const {
  if (isLabel())
    return true;
  return isCFIInstruction();
}

Expand All @@ -1165,6 +1169,9 @@ class MachineInstr
/// Returns true if any of isDebugValue(), isDebugLabel() or isDebugRef()
/// holds for this instruction. The predicates are tried in that order.
bool isDebugInstr() const {
  if (isDebugValue())
    return true;
  if (isDebugLabel())
    return true;
  return isDebugRef();
}
/// Returns true if this is a debug instruction (see isDebugInstr()) or a
/// pseudo probe (see isPseudoProbe()).
bool isDebugOrPseudoInstr() const {
  if (isDebugInstr())
    return true;
  return isPseudoProbe();
}

bool isDebugOffsetImm() const { return getDebugOffset().isImm(); }

Expand Down
3 changes: 3 additions & 0 deletions llvm/include/llvm/IR/Instruction.h
Original file line number Diff line number Diff line change
Expand Up @@ -654,6 +654,9 @@ class Instruction : public User,
/// llvm.lifetime.end marker.
bool isLifetimeStartOrEnd() const;

/// Return true if the instruction is a DbgInfoIntrinsic or PseudoProbeInst.
bool isDebugOrPseudoInst() const;

/// Return a pointer to the next non-debug instruction in the same basic
/// block as 'this', or nullptr if no such instruction exists. Skip any pseudo
/// operations if \c SkipPseudoOp is true.
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/CodeGen/LiveRangeShrink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -156,7 +156,8 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
// If MI has side effects, it should become a barrier for code motion.
// IOM is rebuilt from the next instruction to prevent later
// instructions from being moved before this MI.
if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
if (MI.hasUnmodeledSideEffects() && !MI.isPseudoProbe() &&
Next != MBB.end()) {
BuildInstOrderMap(Next, IOM);
SawStore = false;
}
Expand Down
3 changes: 2 additions & 1 deletion llvm/lib/CodeGen/MachineInstr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1462,7 +1462,8 @@ bool MachineInstr::hasUnmodeledSideEffects() const {
}

bool MachineInstr::isLoadFoldBarrier() const {
return mayStore() || isCall() || hasUnmodeledSideEffects();
return mayStore() || isCall() ||
(hasUnmodeledSideEffects() && !isPseudoProbe());
}

/// allDefsAreDead - Return true if all the defs of this instruction are dead.
Expand Down
5 changes: 3 additions & 2 deletions llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9660,8 +9660,9 @@ findArgumentCopyElisionCandidates(const DataLayout &DL,
// We will look through cast uses, so ignore them completely.
if (I.isCast())
continue;
// Ignore debug info intrinsics, they don't escape or store to allocas.
if (isa<DbgInfoIntrinsic>(I))
// Ignore debug info and pseudo op intrinsics, they don't escape or store
// to allocas.
if (I.isDebugOrPseudoInst())
continue;
// This is an unknown instruction. Assume it escapes or writes to all
// static alloca operands.
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/CodeGen/StackProtector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ bool StackProtector::HasAddressTaken(const Instruction *AI,
// Ignore intrinsics that do not become real instructions.
// TODO: Narrow this to intrinsics that have store-like effects.
const auto *CI = cast<CallInst>(I);
if (!isa<DbgInfoIntrinsic>(CI) && !CI->isLifetimeStartOrEnd())
if (!CI->isDebugOrPseudoInst() && !CI->isLifetimeStartOrEnd())
return true;
break;
}
Expand Down
8 changes: 4 additions & 4 deletions llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -801,8 +801,8 @@ bool TwoAddressInstructionPass::rescheduleMIBelowKill(
MachineBasicBlock::iterator KillPos = KillMI;
++KillPos;
for (MachineInstr &OtherMI : make_range(End, KillPos)) {
// Debug instructions cannot be counted against the limit.
if (OtherMI.isDebugInstr())
// Debug or pseudo instructions cannot be counted against the limit.
if (OtherMI.isDebugOrPseudoInstr())
continue;
if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
return false;
Expand Down Expand Up @@ -974,8 +974,8 @@ bool TwoAddressInstructionPass::rescheduleKillAboveMI(
unsigned NumVisited = 0;
for (MachineInstr &OtherMI :
make_range(mi, MachineBasicBlock::iterator(KillMI))) {
// Debug instructions cannot be counted against the limit.
if (OtherMI.isDebugInstr())
// Debug or pseudo instructions cannot be counted against the limit.
if (OtherMI.isDebugOrPseudoInstr())
continue;
if (NumVisited > 10) // FIXME: Arbitrary limit to reduce compile time cost.
return false;
Expand Down
4 changes: 4 additions & 0 deletions llvm/lib/IR/Instruction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,10 @@ bool Instruction::isLifetimeStartOrEnd() const {
return ID == Intrinsic::lifetime_start || ID == Intrinsic::lifetime_end;
}

/// Return true if this instruction is a DbgInfoIntrinsic or a
/// PseudoProbeInst.
bool Instruction::isDebugOrPseudoInst() const {
  if (isa<DbgInfoIntrinsic>(this))
    return true;
  return isa<PseudoProbeInst>(this);
}

const Instruction *
Instruction::getNextNonDebugInstruction(bool SkipPseudoOp) const {
for (const Instruction *I = getNextNode(); I; I = I->getNextNode())
Expand Down
7 changes: 7 additions & 0 deletions llvm/lib/Transforms/IPO/FunctionAttrs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,13 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
if (isNoModRef(MRI))
continue;

// A pseudo probe call shouldn't change any function attribute since it
// doesn't translate to a real instruction. It carries a memory access
// tag only to keep optimizations from removing it; that tag should not
// block other instructions from being optimized.
if (isa<PseudoProbeInst>(I))
continue;

if (!AliasAnalysis::onlyAccessesArgPointees(MRB)) {
// The call could access any memory. If that includes writes, note it.
if (isModSet(MRI))
Expand Down
8 changes: 7 additions & 1 deletion llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -592,8 +592,14 @@ static bool isSafeAndProfitableToSinkLoad(LoadInst *L) {
BasicBlock::iterator BBI = L->getIterator(), E = L->getParent()->end();

for (++BBI; BBI != E; ++BBI)
if (BBI->mayWriteToMemory())
if (BBI->mayWriteToMemory()) {
// Calls that only access inaccessible memory do not block sinking the
// load.
if (auto *CB = dyn_cast<CallBase>(BBI))
if (CB->onlyAccessesInaccessibleMemory())
continue;
return false;
}

// Check for non-address taken alloca. If not address-taken already, it isn't
// profitable to do this xform.
Expand Down
7 changes: 4 additions & 3 deletions llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3878,9 +3878,10 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
}
}

// Skip processing debug intrinsics in InstCombine. Processing these call instructions
// consumes non-trivial amount of time and provides no value for the optimization.
if (!isa<DbgInfoIntrinsic>(Inst)) {
// Skip processing debug and pseudo intrinsics in InstCombine. Processing
// these call instructions consumes non-trivial amount of time and
// provides no value for the optimization.
if (!Inst->isDebugOrPseudoInst()) {
InstrsForInstCombineWorklist.push_back(Inst);
SeenAliasScopes.analyse(Inst);
}
Expand Down
66 changes: 66 additions & 0 deletions llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
; RUN: opt -passes=instcombine -S < %s | FileCheck %s

%struct.nonbonded = type { [2 x %struct.CompAtom*], [2 x %struct.CompAtomExt*], [2 x %struct.CompAtom*], [2 x %class.Vector*], [2 x %class.Vector*], [2 x i32], %class.Vector, double*, double*, %class.ComputeNonbondedWorkArrays*, %class.Pairlists*, i32, i32, double, double, i32, i32, i32, i32 }
%struct.CompAtomExt = type { i32 }
%struct.CompAtom = type { %class.Vector, float, i16, i8, i8 }
%class.Vector = type { double, double, double }
%class.ComputeNonbondedWorkArrays = type { %class.ResizeArray, %class.ResizeArray.0, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray, %class.ResizeArray.2, %class.ResizeArray.2 }
%class.ResizeArray.0 = type { i32 (...)**, %class.ResizeArrayRaw.1* }
%class.ResizeArrayRaw.1 = type <{ double*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
%class.ResizeArray = type { i32 (...)**, %class.ResizeArrayRaw* }
%class.ResizeArrayRaw = type <{ i16*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
%class.ResizeArray.2 = type { i32 (...)**, %class.ResizeArrayRaw.3* }
%class.ResizeArrayRaw.3 = type <{ %class.Vector*, i8*, i32, i32, i32, float, i32, [4 x i8] }>
%class.Pairlists = type { i16*, i32, i32 }

;; Check the minPart4 and minPart assignments are merged.
; CHECK-COUNT-1: getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
; CHECK-NOT: getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16

define dso_local void @_ZN20ComputeNonbondedUtil9calc_pairEP9nonbonded(%struct.nonbonded* nocapture readonly %params) local_unnamed_addr align 2 {
entry:
%savePairlists3 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 11
%0 = load i32, i32* %savePairlists3, align 8
%usePairlists4 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 12
%1 = load i32, i32* %usePairlists4, align 4
%tobool54.not = icmp eq i32 %0, 0
br i1 %tobool54.not, label %lor.lhs.false55, label %if.end109

lor.lhs.false55: ; preds = %entry
%tobool56.not = icmp eq i32 %1, 0
br i1 %tobool56.not, label %if.end109, label %if.end109.thread

if.end109.thread: ; preds = %lor.lhs.false55
%minPart4 = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
%2 = load i32, i32* %minPart4, align 4
call void @llvm.pseudoprobe(i64 -6172701105289426098, i64 2, i32 0, i64 -1)
br label %if.then138

if.end109: ; preds = %lor.lhs.false55, %entry
%minPart = getelementptr inbounds %struct.nonbonded, %struct.nonbonded* %params, i64 0, i32 16
%3 = load i32, i32* %minPart, align 4
call void @llvm.pseudoprobe(i64 -6172701105289426098, i64 3, i32 0, i64 -1)
%tobool116.not = icmp eq i32 %1, 0
br i1 %tobool116.not, label %if.then117, label %if.then138

if.then117: ; preds = %if.end109
ret void

if.then138: ; preds = %if.end109.thread, %if.end109
%4 = phi i32 [ %2, %if.end109.thread ], [ %3, %if.end109 ]
%tobool139.not = icmp eq i32 %4, 0
br i1 %tobool139.not, label %if.else147, label %if.then140

if.then140: ; preds = %if.then138
ret void

if.else147: ; preds = %if.then138
ret void
}

declare dso_local void @_ZN9Pairlists8addIndexEv() align 2

; Function Attrs: inaccessiblememonly nounwind willreturn
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0

attributes #0 = { inaccessiblememonly nounwind willreturn }
33 changes: 33 additions & 0 deletions llvm/test/Transforms/SampleProfile/pseudo-probe-instsched.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
; PR1075
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-apple-darwin -pseudo-probe-for-profiling -O3 | FileCheck %s

define float @foo(float %x) #0 {
%tmp1 = fmul float %x, 3.000000e+00
%tmp3 = fmul float %x, 5.000000e+00
%tmp5 = fmul float %x, 7.000000e+00
%tmp7 = fmul float %x, 1.100000e+01
call void @llvm.pseudoprobe(i64 6699318081062747564, i64 1, i32 0, i64 -1)
%tmp10 = fadd float %tmp1, %tmp3
%tmp12 = fadd float %tmp10, %tmp5
%tmp14 = fadd float %tmp12, %tmp7
ret float %tmp14
; CHECK: mulss
; CHECK: mulss
; CHECK: addss
; CHECK: mulss
; CHECK: addss
; CHECK: mulss
; CHECK: addss
; CHECK: ret
}

; Function Attrs: inaccessiblememonly nounwind willreturn
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1

attributes #0 = { nounwind }
attributes #1 = { inaccessiblememonly nounwind willreturn }

!llvm.pseudo_probe_desc = !{!0}

!0 = !{i64 6699318081062747564, i64 4294967295, !"foo", null}

29 changes: 29 additions & 0 deletions llvm/test/Transforms/SampleProfile/pseudo-probe-peep.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
; RUN: llc -mtriple=x86_64-- -stop-after=peephole-opt -o - %s | FileCheck %s

define internal i32 @arc_compare() {
entry:
%0 = load i64, i64* undef, align 8
br i1 undef, label %return, label %if.end

if.end: ; preds = %entry
; Check that a register copy has been sunk into the compare instruction.
; CHECK: %[[#REG:]]:gr64 = IMPLICIT_DEF
; CHECK-NOT: %[[#]]:gr64 = MOV64rm %[[#REG]]
; CHECK: PSEUDO_PROBE 5116412291814990879, 3, 0, 0
; CHECK: CMP64mr %[[#REG]], 1
call void @llvm.pseudoprobe(i64 5116412291814990879, i64 3, i32 0, i64 -1)
%cmp4 = icmp slt i64 %0, undef
br i1 %cmp4, label %return, label %if.end6

if.end6: ; preds = %if.end
call void @llvm.pseudoprobe(i64 5116412291814990879, i64 5, i32 0, i64 -1)
br label %return

return: ; preds = %if.end6, %if.end, %entry
ret i32 undef
}

; Function Attrs: inaccessiblememonly nounwind willreturn
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0

attributes #0 = { inaccessiblememonly nounwind willreturn }
37 changes: 37 additions & 0 deletions llvm/test/Transforms/SampleProfile/pseudo-probe-twoaddr.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
; RUN: llc -stop-after=twoaddressinstruction -mtriple=x86_64-- -o - %s | FileCheck %s


define dso_local double @twoaddressinstruction() local_unnamed_addr {
for.end:
%0 = load i64, i64* undef, align 8
br label %for.body14.preheader

for.body14.preheader: ; preds = %for.end
br i1 undef, label %for.cond25.preheader.loopexit.unr-lcssa, label %for.body14.preheader.new

for.body14.preheader.new: ; preds = %for.body14.preheader
%unroll_iter136 = and i64 %0, -4
br label %for.body14

for.cond25.preheader.loopexit.unr-lcssa: ; preds = %for.body14, %for.body14.preheader
%indvars.iv127.unr = phi i64 [ 1, %for.body14.preheader ], [ %indvars.iv.next128.3, %for.body14 ]
ret double undef

for.body14: ; preds = %for.body14, %for.body14.preheader.new
%indvars.iv127 = phi i64 [ 1, %for.body14.preheader.new ], [ %indvars.iv.next128.3, %for.body14 ]
%niter137 = phi i64 [ %unroll_iter136, %for.body14.preheader.new ], [ %niter137.nsub.3, %for.body14 ]
%indvars.iv.next128.3 = add nuw nsw i64 %indvars.iv127, 4
; CHECK: PSEUDO_PROBE -6878943695821059507, 9, 0, 0
call void @llvm.pseudoprobe(i64 -6878943695821059507, i64 9, i32 0, i64 -1)
;; Check an opeq form of instruction is created.
; CHECK: %[[#REG:]]:gr64_nosp = COPY killed %[[#]]
; CHECK: %[[#REG]]:gr64_nosp = nuw ADD64ri8 %[[#REG]], 4, implicit-def dead $eflags
%niter137.nsub.3 = add i64 %niter137, -4
%niter137.ncmp.3 = icmp eq i64 %niter137.nsub.3, 0
br i1 %niter137.ncmp.3, label %for.cond25.preheader.loopexit.unr-lcssa, label %for.body14
}

; Function Attrs: inaccessiblememonly nounwind willreturn
declare void @llvm.pseudoprobe(i64, i64, i32, i64) #0

attributes #0 = { inaccessiblememonly nounwind willreturn }

0 comments on commit e8e45f5

Please sign in to comment.