Skip to content

Commit

Permalink
Reapply "AMDGPU: Move attributor into optimization pipeline (#83131)"…
Browse files Browse the repository at this point in the history
… and follow up commit "clang/AMDGPU: Defeat attribute optimization in attribute test" (#98851)

This reverts commit adaff46.

Drop the -O3 checks from default-attributes.hip. I don't know why they
are different on some bots but reverting this is far too disruptive.
  • Loading branch information
arsenm committed Jul 15, 2024
1 parent 71051de commit b1bcb7c
Show file tree
Hide file tree
Showing 562 changed files with 86,435 additions and 90,198 deletions.
49 changes: 19 additions & 30 deletions clang/test/CodeGenHIP/default-attributes.hip
Original file line number Diff line number Diff line change
Expand Up @@ -2,55 +2,44 @@
// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device \
// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=OPTNONE %s

// RUN: %clang_cc1 -O3 -triple amdgcn-amd-amdhsa -x hip -fno-ident -fcuda-is-device \
// RUN: -emit-llvm -o - %s | FileCheck -check-prefix=OPT %s

#define __device__ __attribute__((device))
#define __global__ __attribute__((global))

//.
// OPTNONE: @__hip_cuid_ = addrspace(1) global i8 0
// OPTNONE: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(1) @__hip_cuid_ to ptr)], section "llvm.metadata"
// OPTNONE: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500
//.
__device__ void extern_func();

// OPTNONE: Function Attrs: convergent mustprogress noinline nounwind optnone
// OPTNONE-LABEL: define {{[^@]+}}@_Z4funcv
// OPTNONE-SAME: () #[[ATTR0:[0-9]+]] {
// OPTNONE-NEXT: entry:
// OPTNONE-NEXT: call void @_Z11extern_funcv() #[[ATTR3:[0-9]+]]
// OPTNONE-NEXT: ret void
//
// OPT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
// OPT-LABEL: define {{[^@]+}}@_Z4funcv
// OPT-SAME: () local_unnamed_addr #[[ATTR0:[0-9]+]] {
// OPT-NEXT: entry:
// OPT-NEXT: ret void
//
__device__ void func() {

extern_func();
}

// OPTNONE: Function Attrs: convergent mustprogress noinline norecurse nounwind optnone
// OPTNONE-LABEL: define {{[^@]+}}@_Z6kernelv
// OPTNONE-SAME: () #[[ATTR1:[0-9]+]] {
// OPTNONE-SAME: () #[[ATTR2:[0-9]+]] {
// OPTNONE-NEXT: entry:
// OPTNONE-NEXT: call void @_Z11extern_funcv() #[[ATTR3]]
// OPTNONE-NEXT: ret void
//
// OPT: Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(none)
// OPT-LABEL: define {{[^@]+}}@_Z6kernelv
// OPT-SAME: () local_unnamed_addr #[[ATTR1:[0-9]+]] {
// OPT-NEXT: entry:
// OPT-NEXT: ret void
//
__global__ void kernel() {

extern_func();
}
//.
// OPTNONE: attributes #0 = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// OPTNONE: attributes #1 = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
//.
// OPT: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// OPT: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
//.
// OPTNONE: !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
// OPTNONE: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
// OPTNONE: !2 = !{i32 1, !"wchar_size", i32 4}
// OPTNONE: attributes #[[ATTR0]] = { convergent mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// OPTNONE: attributes #[[ATTR1:[0-9]+]] = { convergent nounwind "no-trapping-math"="true" "stack-protector-buffer-size"="8" }
// OPTNONE: attributes #[[ATTR2]] = { convergent mustprogress noinline norecurse nounwind optnone "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" }
// OPTNONE: attributes #[[ATTR3]] = { convergent nounwind }
//.
// OPT: !0 = !{i32 1, !"amdhsa_code_object_version", i32 500}
// OPT: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
// OPT: !2 = !{i32 1, !"wchar_size", i32 4}
// OPTNONE: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
// OPTNONE: [[META1:![0-9]+]] = !{i32 1, !"amdgpu_printf_kind", !"hostcall"}
// OPTNONE: [[META2:![0-9]+]] = !{i32 1, !"wchar_size", i32 4}
//.
4 changes: 4 additions & 0 deletions llvm/docs/ReleaseNotes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,10 @@ Changes to the AMDGPU Backend
:ref:`atomicrmw <i_atomicrmw>` instruction with `fadd`, `fmin` and
`fmax` with addrspace(3) instead.

* AMDGPUAttributor is no longer run as part of the codegen pass
pipeline. It is expected to run as part of the middle end
optimizations.

Changes to the ARM Backend
--------------------------

Expand Down
13 changes: 8 additions & 5 deletions llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -731,6 +731,14 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
PM.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
});

// FIXME: Why is AMDGPUAttributor not in CGSCC?
PB.registerOptimizerLastEPCallback(
[this](ModulePassManager &MPM, OptimizationLevel Level) {
if (Level != OptimizationLevel::O0) {
MPM.addPass(AMDGPUAttributorPass(*this));
}
});

PB.registerFullLinkTimeOptimizationLastEPCallback(
[this](ModulePassManager &PM, OptimizationLevel Level) {
// We want to support the -lto-partitions=N option as "best effort".
Expand Down Expand Up @@ -1037,11 +1045,6 @@ void AMDGPUPassConfig::addIRPasses() {
addPass(createAMDGPULowerModuleLDSLegacyPass(&TM));
}

// AMDGPUAttributor infers lack of llvm.amdgcn.lds.kernel.id calls, so run
// after their introduction
if (TM.getOptLevel() > CodeGenOptLevel::None)
addPass(createAMDGPUAttributorLegacyPass());

if (TM.getOptLevel() > CodeGenOptLevel::None)
addPass(createInferAddressSpacesPass());

Expand Down
6 changes: 6 additions & 0 deletions llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -679,6 +679,12 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
break;
}
}

// FIXME: We can spill incoming arguments and restore at the end of the
// prolog.
if (!ScratchWaveOffsetReg)
report_fatal_error(
"could not find temporary scratch offset register in prolog");
} else {
ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
}
Expand Down
16 changes: 8 additions & 8 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/addsubu64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,8 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-LABEL: s_add_u64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_add_u32 s0, s6, s0
Expand All @@ -22,8 +22,8 @@ define amdgpu_kernel void @s_add_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12-LABEL: s_add_u64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_add_nc_u64 s[0:1], s[6:7], s[0:1]
Expand Down Expand Up @@ -58,8 +58,8 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX11-LABEL: s_sub_u64:
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_clause 0x1
; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX11-NEXT: v_mov_b32_e32 v2, 0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_sub_u32 s0, s6, s0
Expand All @@ -74,8 +74,8 @@ define amdgpu_kernel void @s_sub_u64(ptr addrspace(1) %out, i64 %a, i64 %b) {
; GFX12-LABEL: s_sub_u64:
; GFX12: ; %bb.0: ; %entry
; GFX12-NEXT: s_clause 0x1
; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x34
; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24
; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34
; GFX12-NEXT: v_mov_b32_e32 v2, 0
; GFX12-NEXT: s_wait_kmcnt 0x0
; GFX12-NEXT: s_sub_nc_u64 s[0:1], s[6:7], s[0:1]
Expand Down
Loading

0 comments on commit b1bcb7c

Please sign in to comment.