diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index 0656c0d739fdf..d8c9d0a432ad8 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -15,6 +15,7 @@ #include "llvm/ADT/BitVector.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/Support/TypeSize.h" #include @@ -473,6 +474,11 @@ class TargetFrameLowering { /// Return the frame base information to be encoded in the DWARF subprogram /// debug info. virtual DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const; + + /// This method is called at the end of prolog/epilog code insertion, so + /// targets can emit remarks based on the final frame layout. + virtual void emitRemarks(const MachineFunction &MF, + MachineOptimizationRemarkEmitter *ORE) const {}; }; } // End llvm namespace diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index cd5d877e53d82..f4490873cfdcd 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -341,6 +341,9 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) { << ore::NV("Function", MF.getFunction().getName()) << "'"; }); + // Emit any remarks implemented for the target, based on final frame layout. + TFI->emitRemarks(MF, ORE); + delete RS; SaveBlocks.clear(); RestoreBlocks.clear(); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index bd530903bb664..ba46ededc63a8 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -240,6 +240,7 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormatVariadic.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" @@ -275,6 +276,10 @@ cl::opt EnableHomogeneousPrologEpilog( // Stack hazard padding size. 0 = disabled. static cl::opt StackHazardSize("aarch64-stack-hazard-size", cl::init(0), cl::Hidden); +// Stack hazard size for analysis remarks. StackHazardSize takes precedence. +static cl::opt + StackHazardRemarkSize("aarch64-stack-hazard-remark-size", cl::init(0), + cl::Hidden); // Whether to insert padding into non-streaming functions (for testing). static cl::opt StackHazardInNonStreaming("aarch64-stack-hazard-in-non-streaming", @@ -2615,9 +2620,16 @@ AArch64FrameLowering::getFrameIndexReferenceFromSP(const MachineFunction &MF, const auto &MFI = MF.getFrameInfo(); int64_t ObjectOffset = MFI.getObjectOffset(FI); + StackOffset SVEStackSize = getSVEStackSize(MF); + + // For VLA-area objects, just emit an offset at the end of the stack frame. + // Whilst not quite correct, these objects do live at the end of the frame and + // so it is more useful for analysis for the offset to reflect this. + if (MFI.isVariableSizedObjectIndex(FI)) { + return StackOffset::getFixed(-((int64_t)MFI.getStackSize())) - SVEStackSize; + } // This is correct in the absence of any SVE stack objects. - StackOffset SVEStackSize = getSVEStackSize(MF); if (!SVEStackSize) return StackOffset::getFixed(ObjectOffset - getOffsetOfLocalArea()); @@ -3528,13 +3540,9 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters( return true; } -// Return the FrameID for a Load/Store instruction by looking at the MMO. -static std::optional getLdStFrameID(const MachineInstr &MI, - const MachineFrameInfo &MFI) { - if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) - return std::nullopt; - - MachineMemOperand *MMO = *MI.memoperands_begin(); +// Return the FrameID for a MMO. +static std::optional getMMOFrameID(MachineMemOperand *MMO, + const MachineFrameInfo &MFI) { auto *PSV = dyn_cast_or_null(MMO->getPseudoValue()); if (PSV) @@ -3552,6 +3560,15 @@ static std::optional getLdStFrameID(const MachineInstr &MI, return std::nullopt; } +// Return the FrameID for a Load/Store instruction by looking at the first MMO. +static std::optional getLdStFrameID(const MachineInstr &MI, + const MachineFrameInfo &MFI) { + if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) + return std::nullopt; + + return getMMOFrameID(*MI.memoperands_begin(), MFI); +} + // Check if a Hazard slot is needed for the current function, and if so create // one for it. The index is stored in AArch64FunctionInfo->StackHazardSlotIndex, // which can be used to determine if any hazard padding is needed. @@ -5029,3 +5046,174 @@ void AArch64FrameLowering::inlineStackProbe(MachineFunction &MF, MI->eraseFromParent(); } } + +struct StackAccess { + enum AccessType { + NotAccessed = 0, // Stack object not accessed by load/store instructions. + GPR = 1 << 0, // A general purpose register. + PPR = 1 << 1, // A predicate register. + FPR = 1 << 2, // A floating point/Neon/SVE register. + }; + + int Idx; + StackOffset Offset; + int64_t Size; + unsigned AccessTypes; + + StackAccess() : Idx(0), Offset(), Size(0), AccessTypes(NotAccessed) {} + + bool operator<(const StackAccess &Rhs) const { + return std::make_tuple(start(), Idx) < + std::make_tuple(Rhs.start(), Rhs.Idx); + } + + bool isCPU() const { + // Predicate register load and store instructions execute on the CPU. + return AccessTypes & (AccessType::GPR | AccessType::PPR); + } + bool isSME() const { return AccessTypes & AccessType::FPR; } + bool isMixed() const { return isCPU() && isSME(); } + + int64_t start() const { return Offset.getFixed() + Offset.getScalable(); } + int64_t end() const { return start() + Size; } + + std::string getTypeString() const { + switch (AccessTypes) { + case AccessType::FPR: + return "FPR"; + case AccessType::PPR: + return "PPR"; + case AccessType::GPR: + return "GPR"; + case AccessType::NotAccessed: + return "NA"; + default: + return "Mixed"; + } + } + + void print(raw_ostream &OS) const { + OS << getTypeString() << " stack object at [SP" + << (Offset.getFixed() < 0 ? "" : "+") << Offset.getFixed(); + if (Offset.getScalable()) + OS << (Offset.getScalable() < 0 ? "" : "+") << Offset.getScalable() + << " * vscale"; + OS << "]"; + } +}; + +static inline raw_ostream &operator<<(raw_ostream &OS, const StackAccess &SA) { + SA.print(OS); + return OS; +} + +void AArch64FrameLowering::emitRemarks( + const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const { + + SMEAttrs Attrs(MF.getFunction()); + if (Attrs.hasNonStreamingInterfaceAndBody()) + return; + + const uint64_t HazardSize = + (StackHazardSize) ? StackHazardSize : StackHazardRemarkSize; + + if (HazardSize == 0) + return; + + const MachineFrameInfo &MFI = MF.getFrameInfo(); + // Bail if function has no stack objects. + if (!MFI.hasStackObjects()) + return; + + std::vector StackAccesses(MFI.getNumObjects()); + + size_t NumFPLdSt = 0; + size_t NumNonFPLdSt = 0; + + // Collect stack accesses via Load/Store instructions. + for (const MachineBasicBlock &MBB : MF) { + for (const MachineInstr &MI : MBB) { + if (!MI.mayLoadOrStore() || MI.getNumMemOperands() < 1) + continue; + for (MachineMemOperand *MMO : MI.memoperands()) { + std::optional FI = getMMOFrameID(MMO, MFI); + if (FI && !MFI.isDeadObjectIndex(*FI)) { + int FrameIdx = *FI; + + size_t ArrIdx = FrameIdx + MFI.getNumFixedObjects(); + if (StackAccesses[ArrIdx].AccessTypes == StackAccess::NotAccessed) { + StackAccesses[ArrIdx].Idx = FrameIdx; + StackAccesses[ArrIdx].Offset = + getFrameIndexReferenceFromSP(MF, FrameIdx); + StackAccesses[ArrIdx].Size = MFI.getObjectSize(FrameIdx); + } + + unsigned RegTy = StackAccess::AccessType::GPR; + if (MFI.getStackID(FrameIdx) == TargetStackID::ScalableVector) { + if (AArch64::PPRRegClass.contains(MI.getOperand(0).getReg())) + RegTy = StackAccess::PPR; + else + RegTy = StackAccess::FPR; + } else if (AArch64InstrInfo::isFpOrNEON(MI)) { + RegTy = StackAccess::FPR; + } + + StackAccesses[ArrIdx].AccessTypes |= RegTy; + + if (RegTy == StackAccess::FPR) + ++NumFPLdSt; + else + ++NumNonFPLdSt; + } + } + } + } + + if (NumFPLdSt == 0 || NumNonFPLdSt == 0) + return; + + llvm::sort(StackAccesses); + StackAccesses.erase(llvm::remove_if(StackAccesses, + [](const StackAccess &S) { + return S.AccessTypes == + StackAccess::NotAccessed; + }), + StackAccesses.end()); + + SmallVector MixedObjects; + SmallVector> HazardPairs; + + if (StackAccesses.front().isMixed()) + MixedObjects.push_back(&StackAccesses.front()); + + for (auto It = StackAccesses.begin(), End = std::prev(StackAccesses.end()); + It != End; ++It) { + const auto &First = *It; + const auto &Second = *(It + 1); + + if (Second.isMixed()) + MixedObjects.push_back(&Second); + + if ((First.isSME() && Second.isCPU()) || + (First.isCPU() && Second.isSME())) { + uint64_t Distance = static_cast(Second.start() - First.end()); + if (Distance < HazardSize) + HazardPairs.emplace_back(&First, &Second); + } + } + + auto EmitRemark = [&](llvm::StringRef Str) { + ORE->emit([&]() { + auto R = MachineOptimizationRemarkAnalysis( + "sme", "StackHazard", MF.getFunction().getSubprogram(), &MF.front()); + return R << formatv("stack hazard in '{0}': ", MF.getName()).str() << Str; + }); + }; + + for (const auto &P : HazardPairs) + EmitRemark(formatv("{0} is too close to {1}", *P.first, *P.second).str()); + + for (const auto *Obj : MixedObjects) + EmitRemark( + formatv("{0} accessed by both GP and FP instructions", *Obj).str()); +} diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 0ebab1700e9ce..c197312496208 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -13,8 +13,9 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H #define LLVM_LIB_TARGET_AARCH64_AARCH64FRAMELOWERING_H -#include "llvm/Support/TypeSize.h" +#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h" #include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/Support/TypeSize.h" namespace llvm { @@ -178,6 +179,9 @@ class AArch64FrameLowering : public TargetFrameLowering { inlineStackProbeLoopExactMultiple(MachineBasicBlock::iterator MBBI, int64_t NegProbeSize, Register TargetReg) const; + + void emitRemarks(const MachineFunction &MF, + MachineOptimizationRemarkEmitter *ORE) const override; }; } // End llvm namespace diff --git a/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll new file mode 100644 index 0000000000000..0b6bf3892a0c2 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/ssve-stack-hazard-remarks.ll @@ -0,0 +1,152 @@ +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-stack-hazard-remark-size=64 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -pass-remarks-analysis=sme -aarch64-stack-hazard-size=1024 -o /dev/null < %s 2>&1 | FileCheck %s --check-prefixes=CHECK-PADDING + +; Don't emit remarks for non-streaming functions. +define float @csr_x20_stackargs_notsc(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) { +; CHECK-NOT: remark: :0:0: stack hazard in 'csr_x20_stackargs_notsc': +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'csr_x20_stackargs_notsc': +entry: + tail call void asm sideeffect "", "~{x20}"() #1 + ret float %i +} + +; Don't emit remarks for functions that only access GPR stack objects. +define i64 @stackargs_gpr(i64 %a, i64 %b, i64 %c, i64 %d, i64 %e, i64 %f, i64 %g, i64 %h, i64 %i) #2 { +; CHECK-NOT: remark: :0:0: stack hazard in 'csr_x20_stackargs_gpr': +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'csr_x20_stackargs_gpr': +entry: + ret i64 %i +} + +; Don't emit remarks for functions that only access FPR stack objects. +define double @stackargs_fpr(double %a, double %b, double %c, double %d, double %e, double %f, double %g, double %h, double %i) #2 { +; CHECK-NOT: remark: :0:0: stack hazard in 'csr_x20_stackargs_fpr': +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'csr_x20_stackargs_fpr': +entry: + ret double %i +} + +; As this case is handled by addition of stack hazard padding, only emit remarks when this is not switched on. +define i32 @csr_d8_alloci64(i64 %d) #2 { +; CHECK: remark: :0:0: stack hazard in 'csr_d8_alloci64': FPR stack object at [SP-16] is too close to GPR stack object at [SP-8] +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'csr_d8_alloci64': +entry: + %a = alloca i64 + tail call void asm sideeffect "", "~{d8}"() #1 + store i64 %d, ptr %a + ret i32 0 +} + +; As this case is handled by addition of stack hazard padding, only emit remarks when this is not switched on. +define i32 @csr_d8_allocnxv4i32(i64 %d) #2 { +; CHECK: remark: :0:0: stack hazard in 'csr_d8_allocnxv4i32': FPR stack object at [SP-16] is too close to GPR stack object at [SP-8] +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'csr_d8_allocnxv4i32': +entry: + %a = alloca + tail call void asm sideeffect "", "~{d8}"() #1 + store zeroinitializer, ptr %a + ret i32 0 +} + +define float @csr_x20_stackargs(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) #2 { +; CHECK: remark: :0:0: stack hazard in 'csr_x20_stackargs': GPR stack object at [SP-16] is too close to FPR stack object at [SP+0] +; CHECK-PADDING: remark: :0:0: stack hazard in 'csr_x20_stackargs': GPR stack object at [SP-16] is too close to FPR stack object at [SP+0] +entry: + tail call void asm sideeffect "", "~{x20}"() #1 + ret float %i +} + +; In this case, addition of stack hazard padding triggers x29 (fp) spill, so we hazard occurs between FPR argument and GPR spill. +define float @csr_d8_stackargs(float %a, float %b, float %c, float %d, float %e, float %f, float %g, float %h, float %i) #2 { +; CHECK-NOT: remark: :0:0: stack hazard in 'csr_d8_stackargs': +; CHECK-PADDING: remark: :0:0: stack hazard in 'csr_d8_stackargs': GPR stack object at [SP-8] is too close to FPR stack object at [SP+0] +entry: + tail call void asm sideeffect "", "~{d8}"() #1 + ret float %i +} + +; SVE calling conventions +; Predicate register spills end up in FP region, currently. + +define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, i16 %P4) #2 { +; CHECK: remark: :0:0: stack hazard in 'svecc_call': PPR stack object at [SP-48-258 * vscale] is too close to FPR stack object at [SP-48-256 * vscale] +; CHECK: remark: :0:0: stack hazard in 'svecc_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48] +; CHECK-PADDING: remark: :0:0: stack hazard in 'svecc_call': PPR stack object at [SP-1072-258 * vscale] is too close to FPR stack object at [SP-1072-256 * vscale] +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'svecc_call': +entry: + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37) + ret i32 -396142473 +} + +define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, %P3, i16 %P4) #2 { +; CHECK: remark: :0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-48-258 * vscale] is too close to FPR stack object at [SP-48-256 * vscale] +; CHECK: remark: :0:0: stack hazard in 'svecc_alloca_call': FPR stack object at [SP-48-16 * vscale] is too close to GPR stack object at [SP-48] +; CHECK-PADDING: remark: :0:0: stack hazard in 'svecc_alloca_call': PPR stack object at [SP-1072-258 * vscale] is too close to FPR stack object at [SP-1072-256 * vscale] +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'svecc_alloca_call': +entry: + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %0 = alloca [37 x i8], align 16 + %call = call ptr @memset(ptr noundef nonnull %0, i32 noundef 45, i32 noundef 37) + ret i32 -396142473 +} +declare ptr @memset(ptr, i32, i32) + +%struct.mixed_struct = type { i32, float } + +define i32 @mixed_stack_object(i32 %a, float %b) #2 { +; CHECK: remark: :0:0: stack hazard in 'mixed_stack_object': Mixed stack object at [SP-8] accessed by both GP and FP instructions +; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_object': Mixed stack object at [SP-8] accessed by both GP and FP instructions +entry: + %s = alloca %struct.mixed_struct + %s.i = getelementptr %struct.mixed_struct, ptr %s, i32 0, i32 0 + %s.f = getelementptr %struct.mixed_struct, ptr %s, i32 0, i32 1 + store i32 %a, ptr %s.i + store float %b, ptr %s.f + ret i32 %a +} + +define i32 @mixed_stack_objects(i32 %a, float %b) #2 { +; CHECK: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-16] is too close to Mixed stack object at [SP-8] +; CHECK: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-16] accessed by both GP and FP instructions +; CHECK: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-8] accessed by both GP and FP instructions +; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-16] is too close to Mixed stack object at [SP-8] +; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-16] accessed by both GP and FP instructions +; CHECK-PADDING: remark: :0:0: stack hazard in 'mixed_stack_objects': Mixed stack object at [SP-8] accessed by both GP and FP instructions +entry: + %s0 = alloca %struct.mixed_struct + %s0.i = getelementptr %struct.mixed_struct, ptr %s0, i32 0, i32 0 + %s0.f = getelementptr %struct.mixed_struct, ptr %s0, i32 0, i32 1 + store i32 %a, ptr %s0.i + store float %b, ptr %s0.f + + %s1 = alloca %struct.mixed_struct + %s1.i = getelementptr %struct.mixed_struct, ptr %s1, i32 0, i32 0 + %s1.f = getelementptr %struct.mixed_struct, ptr %s1, i32 0, i32 1 + store i32 %a, ptr %s1.i + store float %b, ptr %s1.f + + ret i32 %a +} + +; VLA-area stack objects are not separated. +define i32 @csr_d8_allocnxv4i32i32f64_vlai32f64(double %d, i32 %i) #2 { +; CHECK: remark: :0:0: stack hazard in 'csr_d8_allocnxv4i32i32f64_vlai32f64': GPR stack object at [SP-48-16 * vscale] is too close to FPR stack object at [SP-48-16 * vscale] +; CHECK: remark: :0:0: stack hazard in 'csr_d8_allocnxv4i32i32f64_vlai32f64': FPR stack object at [SP-32] is too close to GPR stack object at [SP-24] +; CHECK-PADDING: remark: :0:0: stack hazard in 'csr_d8_allocnxv4i32i32f64_vlai32f64': GPR stack object at [SP-2096-16 * vscale] is too close to FPR stack object at [SP-2096-16 * vscale] +; CHECK-PADDING-NOT: remark: :0:0: stack hazard in 'csr_d8_allocnxv4i32i32f64_vlai32f64': +entry: + %a = alloca + %0 = zext i32 %i to i64 + %vla0 = alloca i32, i64 %0 + %vla1 = alloca double, i64 %0 + %c = alloca double + tail call void asm sideeffect "", "~{d8}"() #1 + store zeroinitializer, ptr %a + store i32 zeroinitializer, ptr %vla0 + store double %d, ptr %vla1 + store double %d, ptr %c + ret i32 0 +} + +attributes #2 = { "aarch64_pstate_sm_compatible" } diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll index 431c9dc76508f..ec94198a08ca7 100644 --- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll @@ -150,8 +150,8 @@ entry: ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Variable, Align: 16, Size: vscale x 16 ; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-40-16 x vscale], Type: Variable, Align: 8, Size: 8 -; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: VariableSized, Align: 1, Size: 0 -; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: VariableSized, Align: 1, Size: 0 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-16 x vscale], Type: VariableSized, Align: 1, Size: 0 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48-16 x vscale], Type: VariableSized, Align: 1, Size: 0 define i32 @csr_d8_allocnxv4i32i32f64_vla(double %d, i32 %i) "aarch64_pstate_sm_compatible" { ; CHECK-LABEL: csr_d8_allocnxv4i32i32f64_vla: